# distributed_belief_propagation_train.py

# -*- coding: utf-8 -*-
"""Distributed Belief Propagation_train.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1YmRny5Lt65kfgLTdgur0jwfonyEWhMvl
"""

# Environment setup for the Colab runtime.
#
# The `!` shell escapes are IPython-only syntax and are a SyntaxError in a
# plain Python file, so they are commented out here in the same way this
# export already comments out the `%cd` magic further down. Run them in a
# notebook cell or a shell before executing this script.
# !pip install pyspark
# !pip install bigdl-orca-spark3
# !pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# !pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cu116.html
# !pip install ray

# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Commented out IPython magic to ensure Python compatibility.
# %cd "/content/drive/MyDrive/501_proje"

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import copy
import warnings
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

#from tf_models import *

# --- Data loading and train/test preparation --------------------------------
import pickle

# Load the pre-built data bundle. The path is relative to the current working
# directory (the notebook's `%cd` into the Drive folder is commented out above).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('train_test_split.pkl', 'rb') as f:
    data_object = pickle.load(f)

# The bundle unpacks into exactly five items.
node_features, labels, classified_idx, edges, edge_weights = data_object
# Labels restricted to the entries selected by classified_idx.
labels_filtered = labels[classified_idx]
# Rows at classified_idx, then the column labelled 0. These values are later
# fed to the model as node indices (see GNNNodeClassifier.call).
# NOTE(review): presumably column 0 holds node ids that match row positions in
# node_features - confirm.
node_features_filtered = node_features.iloc[classified_idx][0]

# Notebook display expression; has no effect when run as a script.
node_features

# Transpose so that edges[0] / edges[1] can be read as the two endpoint rows
# (GraphConvLayer.call indexes them that way) and edges.shape[1] is the edge count.
edges = edges.to_numpy().T
#edge_weights = tf.ones(shape=edges.shape[1])
# The edge_weights loaded from the pickle are discarded: every edge gets weight 1.
edge_weights = np.ones(shape=edges.shape[1])
#node_features = tf.cast(node_features.to_numpy(), dtype=tf.dtypes.float32)
# From here on node_features is a float32 ndarray, no longer a DataFrame.
node_features = node_features.to_numpy(dtype='float32')

# test_size=0.99 keeps only 1% of the classified entries for training.
X_train, X_test, y_train, y_test = train_test_split(node_features_filtered, labels_filtered, test_size=0.99, random_state=4)

# Model inputs are converted to int32 ndarrays.
X_train = X_train.to_numpy(dtype='int32')
X_test = X_test.to_numpy(dtype='int32')

y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

# Tuple consumed by GNNNodeClassifier: (node_features, edges, edge_weights).
graph_info = (node_features, edges, edge_weights)
# num_classes is the number of distinct values in the full label set
# (not only the filtered labels).
unique, counts = np.unique(labels, return_counts=True)
num_classes = len(unique)


# Hyperparameters read as globals by model_creator.
hidden_units = [32, 32]
learning_rate = 0.01
dropout_rate = 0.5
# NOTE(review): num_epochs is not referenced by the visible training call,
# which passes epochs=3 directly.
num_epochs = 1
# batch_size is reassigned to 32 before training further down.
batch_size = 16

# Notebook display expression; has no effect when run as a script.
y_train

import tensorflow as tf
from tensorflow import keras
#from tensorflow.keras import layers
#from tensorflow.python.keras import layers


def create_ffn(hidden_units, dropout_rate, name=None):
    """Build a feed-forward stack wrapped in a ``keras.Sequential``.

    For every entry of ``hidden_units`` the stack receives, in this order, a
    BatchNormalization layer, a Dropout layer using ``dropout_rate``, and a
    Dense layer of that width with GELU activation.

    Args:
        hidden_units: iterable of Dense layer widths, one triple per entry.
        dropout_rate: rate passed to each Dropout layer.
        name: optional name forwarded to the Sequential model.

    Returns:
        The assembled ``keras.Sequential`` model.
    """
    stack = []
    for width in hidden_units:
        stack.extend(
            (
                keras.layers.BatchNormalization(),
                keras.layers.Dropout(dropout_rate),
                keras.layers.Dense(width, activation=tf.nn.gelu),
            )
        )
    return keras.Sequential(stack, name=name)


class GraphConvLayer(keras.layers.Layer):
    """Graph convolution layer: prepare, aggregate and combine neighbour messages.

    For every edge the neighbour's representation is passed through a
    feed-forward network (optionally scaled by the edge weight), the resulting
    messages are aggregated per node, and the aggregate is combined with the
    node's own representation to produce the new node embedding.

    Args:
        hidden_units: list of layer widths used for the internal networks.
        dropout_rate: dropout rate used by the internal networks.
        aggregation_type: "sum", "mean" or "max".
        combination_type: "concat", "add" or "gru" ("gated" is accepted as an
            alias of "gru").
        normalize: if True, L2-normalize the output embeddings on the last axis.
        *args, **kwargs: forwarded to ``keras.layers.Layer``.

    Raises:
        ValueError: at call time, when ``aggregation_type`` or
            ``combination_type`` is not one of the supported values.
    """

    # Both spellings select the GRU-based update. Previously the constructor
    # only recognised "gated" while update() only recognised "gru", so no
    # setting ever ran a GRU.
    _GRU_COMBINATIONS = ("gru", "gated")

    def __init__(
        self,
        hidden_units,
        dropout_rate=0.2,
        aggregation_type="mean",
        combination_type="concat",
        normalize=False,
        *args,
        **kwargs,
    ):
        super(GraphConvLayer, self).__init__(*args, **kwargs)

        self.aggregation_type = aggregation_type
        self.combination_type = combination_type
        self.normalize = normalize

        self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
        if self.combination_type in self._GRU_COMBINATIONS:
            # One GRU per entry of hidden_units (GRU `units` must be an int,
            # not the whole list). return_sequences=True keeps the time axis
            # so update() can pick the last step with tf.unstack(axis=1).
            self.update_fn = keras.Sequential(
                [
                    keras.layers.GRU(
                        units=units,
                        activation="tanh",
                        recurrent_activation="sigmoid",
                        dropout=dropout_rate,
                        recurrent_dropout=dropout_rate,
                        return_sequences=True,
                    )
                    for units in hidden_units
                ]
            )
        else:
            self.update_fn = create_ffn(hidden_units, dropout_rate)

    def prepare(self, node_repesentations, weights=None):
        """Turn per-edge neighbour representations into messages.

        The representations go through ``ffn_prepare``; when ``weights`` is
        given, each message is multiplied by its edge weight.
        """
        # node_repesentations shape is [num_edges, embedding_dim].
        messages = self.ffn_prepare(node_repesentations)
        if weights is not None:
            messages = messages * tf.expand_dims(weights, -1)
        return messages

    def aggregate(self, node_indices, neighbour_messages, node_repesentations):
        """Reduce the per-edge messages to one message per node.

        Raises:
            ValueError: if ``aggregation_type`` is not "sum", "mean" or "max".
        """
        # node_indices shape is [num_edges].
        # neighbour_messages shape: [num_edges, representation_dim].
        # node_repesentations shape is [num_nodes, representation_dim]
        num_nodes = node_repesentations.shape[0]
        if self.aggregation_type == "sum":
            aggregated_message = tf.math.unsorted_segment_sum(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "mean":
            aggregated_message = tf.math.unsorted_segment_mean(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "max":
            aggregated_message = tf.math.unsorted_segment_max(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        else:
            raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")

        return aggregated_message

    def update(self, node_repesentations, aggregated_messages):
        """Combine each node's representation with its aggregated message.

        Raises:
            ValueError: if ``combination_type`` is not a supported value.
        """
        # node_repesentations shape is [num_nodes, representation_dim].
        # aggregated_messages shape is [num_nodes, representation_dim].
        uses_gru = self.combination_type in self._GRU_COMBINATIONS
        if uses_gru:
            # Create a sequence of two elements for the GRU layer.
            h = tf.stack([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "concat":
            # Concatenate the node_repesentations and aggregated_messages.
            h = tf.concat([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "add":
            # Add node_repesentations and aggregated_messages.
            h = node_repesentations + aggregated_messages
        else:
            raise ValueError(f"Invalid combination type: {self.combination_type}.")

        # Apply the processing function.
        node_embeddings = self.update_fn(h)
        if uses_gru:
            # Keep only the output of the last of the two sequence steps.
            node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]

        if self.normalize:
            node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
        return node_embeddings

    def call(self, inputs):
        """Process the inputs to produce the node_embeddings.

        inputs: a tuple of three elements: node_repesentations, edges, edge_weights.
        Returns: node_embeddings of shape [num_nodes, representation_dim].
        """

        node_repesentations, edges, edge_weights = inputs
        # Get node_indices (source) and neighbour_indices (target) from edges.
        node_indices, neighbour_indices = edges[0], edges[1]
        # neighbour_repesentations shape is [num_edges, representation_dim].
        neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices)

        # Prepare the messages of the neighbours.
        neighbour_messages = self.prepare(neighbour_repesentations, edge_weights)
        # Aggregate the neighbour messages.
        aggregated_messages = self.aggregate(
            node_indices, neighbour_messages, node_repesentations
        )
        # Update the node embedding with the neighbour messages.
        return self.update(node_repesentations, aggregated_messages)


class GNNNodeClassifier(tf.keras.Model):
    """Node classifier built from two graph convolution layers.

    The graph (node features, edges, edge weights) is stored on the model;
    calling the model with node indices returns class logits for those nodes.

    Args:
        graph_info: tuple ``(node_features, edges, edge_weights)``;
            ``edge_weights`` may be None, in which case ones are used.
        num_classes: number of output logits.
        hidden_units: layer widths shared by all internal networks.
        aggregation_type: forwarded to both GraphConvLayer instances.
        combination_type: forwarded to both GraphConvLayer instances.
        dropout_rate: dropout rate shared by all internal networks.
        normalize: forwarded to both GraphConvLayer instances.
        *args, **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(
        self,
        graph_info,
        num_classes,
        hidden_units,
        aggregation_type="sum",
        combination_type="concat",
        dropout_rate=0.2,
        normalize=True,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        # Keep the graph on the model so call() only needs node indices.
        features, edge_index, weights = graph_info
        self.node_features = features
        self.edges = edge_index
        # Fall back to unit weights (one per edge) when none are supplied.
        self.edge_weights = (
            tf.ones(shape=edge_index.shape[1]) if weights is None else weights
        )
        # Rescale the weights so that they sum to 1.
        self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights)

        # Positional arguments shared by both graph convolution layers.
        conv_args = (
            hidden_units,
            dropout_rate,
            aggregation_type,
            combination_type,
            normalize,
        )

        # Layers are created in the order they are applied in call().
        self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
        self.conv1 = GraphConvLayer(*conv_args, name="graph_conv1")
        self.conv2 = GraphConvLayer(*conv_args, name="graph_conv2")
        self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
        self.compute_logits = keras.layers.Dense(units=num_classes, name="logits")

    def call(self, input_node_indices):
        """Return logits for the nodes selected by ``input_node_indices``."""
        # Initial node representations from the raw features.
        hidden = self.preprocess(self.node_features)
        # Two graph convolutions, each followed by a skip connection.
        for conv in (self.conv1, self.conv2):
            hidden = conv((hidden, self.edges, self.edge_weights)) + hidden
        hidden = self.postprocess(hidden)
        # Select the embeddings of the requested nodes and project to logits.
        selected = tf.gather(hidden, input_node_indices)
        return self.compute_logits(selected)

def test_data_creator():
    """Return an unbatched tf.data.Dataset over the global (X_test, y_test)."""
    return tf.data.Dataset.from_tensor_slices((X_test, y_test))

def train_data_creator(config, batch_size):
    """Build the training dataset from the global (X_train, y_train).

    The dataset is repeated indefinitely, shuffled with a buffer of 1000
    elements, and then batched.

    Args:
        config: accepted for the creator-function signature; not read here.
        batch_size: number of elements per batch.

    Returns:
        The batched ``tf.data.Dataset``.
    """
    return (
        tf.data.Dataset.from_tensor_slices((X_train, y_train))
        .repeat()
        .shuffle(1000)
        .batch(batch_size)
    )


def model_creator(config):
    """Create and compile the GNN node classifier.

    All settings come from module-level globals (graph_info, num_classes,
    hidden_units, dropout_rate, learning_rate); ``config`` is accepted for the
    creator-function signature but is not read.

    Returns:
        The compiled ``GNNNodeClassifier``.
    """
    gnn = GNNNodeClassifier(
        graph_info=graph_info,
        num_classes=num_classes,
        hidden_units=hidden_units,
        dropout_rate=dropout_rate,
        name="gnn_model",
    )

    optimizer = keras.optimizers.Adam(learning_rate)
    # The model outputs raw logits, hence from_logits=True.
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy = keras.metrics.SparseCategoricalAccuracy(name="acc")

    gnn.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy])
    return gnn

# --- Orca context, estimator creation and training ---------------------------
from bigdl.orca import init_orca_context, stop_orca_context, OrcaContext
from bigdl.orca.learn.tf2 import Estimator

# Start a local Orca context and grab its Spark session.
# NOTE(review): stop_orca_context is imported but never called in the visible
# part of the file - confirm the context is shut down somewhere.
sc = init_orca_context(cluster_mode="local", cores=1, memory="4g", num_nodes=1)
spark = OrcaContext.get_spark_session()

# NOTE(review): workers_per_node=2 while the context above was started with
# cores=1 - verify this combination is intended.
est = Estimator.from_keras(model_creator=model_creator, workers_per_node=2,backend="ray",)

# Notebook display expression; has no effect when run as a script.
X_train.shape[0]

# Overrides the batch_size of 16 set with the other hyperparameters.
batch_size = 32
# NOTE(review): evaluates to 0 when X_train has fewer than batch_size rows -
# confirm the training split is large enough.
train_steps = int(X_train.shape[0] / batch_size)

# train_data_creator repeats its dataset indefinitely, so steps_per_epoch
# bounds each epoch.
stats = est.fit(train_data_creator, epochs=3, batch_size=batch_size, steps_per_epoch=train_steps)


# --- Exploratory conversions of the test split -------------------------------
# read_parquet has to be imported before its first use below; in the notebook
# export the import came after the call, which raises NameError when the file
# is executed from top to bottom.
from bigdl.orca.data.pandas.preprocessing import read_parquet

# One-column pandas frames for the test inputs ('a') and labels ('b').
df_test_x = pd.DataFrame(X_test, columns=list('a'))
df_test_y = pd.DataFrame(y_test, columns=list('b'))
a =spark.createDataFrame(df_test_x)
b=spark.createDataFrame(df_test_y)
b.show()
# pd.concat(axis=1) aligns rows on the index.
# NOTE(review): df_test_x has a default RangeIndex; if y_test carries a
# different index the rows will not line up - verify.
comp = pd.concat([df_test_x, df_test_y], axis=1)
# Both comp2 and a are bound to the same Spark DataFrame.
comp2=a =spark.createDataFrame(comp)
comp2.show()

# Uses the zero-argument test_data_creator defined above; `a` is rebound.
a=test_data_creator()

# Notebook display expressions; no effect when run as a script.
y_test

comp

# Round-trip the combined frame through a gzip-compressed parquet file.
comp.to_parquet('df.parquet.gzip',
              compression='gzip')

a=read_parquet('df.parquet.gzip')

dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))


# Notebook display expression; no effect when run as a script.
len(y_test)

# --- Re-split with a tiny test set and build XShards --------------------------
# Overwrites the earlier split: test_size=0.0001 leaves almost everything in
# the training part. The same random_state=4 is used.
X_train, X_test, y_train, y_test = train_test_split(node_features_filtered, labels_filtered, test_size=0.0001, random_state=4)

X_train = X_train.to_numpy(dtype='int32')
X_test = X_test.to_numpy(dtype='int32')

y_train = pd.Series(y_train)
y_test = pd.Series(y_test)


import numpy as np
from bigdl.orca.data import XShards

# NOTE(review): despite the name, train_shards is built from the *test* split.
# NOTE(review): y_test is a pandas Series here while X_test is an ndarray -
# confirm XShards.partition accepts this mix.
train_shards = XShards.partition([X_test, y_test])

def transform_to_dict(train_data):
    """Map the first two items of ``train_data`` to the keys "x" and "y"."""
    features = train_data[0]
    targets = train_data[1]
    return dict(x=features, y=targets)

# Convert every shard into a {"x": ..., "y": ...} dict via transform_to_dict.
train_shards = train_shards.transform_shard(transform_to_dict)

# Notebook display expression; has no effect when run as a script.
len(X_test)

# Rebuild the pandas / Spark frames for the new (tiny) test split.
df_test_x = pd.DataFrame(X_test, columns=list('a'))
df_test_y = pd.DataFrame(y_test, columns=list('b'))
a =spark.createDataFrame(df_test_x)
b=spark.createDataFrame(df_test_y)
# pd.concat(axis=1) aligns rows on the index.
# NOTE(review): df_test_x has a default RangeIndex; if y_test carries a
# different index the rows will not line up - verify.
comp = pd.concat([df_test_x, df_test_y], axis=1)
# Both comp2 and a are bound to the same Spark DataFrame.
comp2=a =spark.createDataFrame(comp)

# Notebook display expression; has no effect when run as a script.
type(comp2)

def test_data_creator(config, batch_size):
    """Build a batched evaluation dataset from the global test split.

    Only the first 200 entries of X_test and y_test are used.

    Args:
        config: accepted for the creator-function signature; not read here.
        batch_size: number of elements per batch.

    Returns:
        The batched ``tf.data.Dataset``.
    """
    head = slice(0, 200)
    pairs = (X_test[head], y_test[head])
    return tf.data.Dataset.from_tensor_slices(pairs).batch(batch_size)

# --- Evaluation and prediction -------------------------------------------------
# Evaluate with the two-argument test_data_creator defined just above.
result = est.evaluate(data=test_data_creator, batch_size=32)
# result is indexed by each iterated item, so it is expected to be a mapping
# - presumably metric name -> value; confirm against the Orca API.
for r in result:
    print(r, ":", result[r])

# train_shards was built from the test split. NOTE(review): the second
# positional argument (20) is presumably the batch size - confirm against the
# Estimator.predict signature.
prediction_df = est.predict(train_shards,20)