Add MLCube implementation for Graph Neural Network #762

Open · wants to merge 4 commits into base: master

1 change: 1 addition & 0 deletions graph_neural_network/.dockerignore
@@ -0,0 +1 @@
mlcube/workspace/*
21 changes: 21 additions & 0 deletions graph_neural_network/Dockerfile_mlcube
@@ -0,0 +1,21 @@
FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel

WORKDIR /workspace/repository

RUN pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117
RUN pip install scikit-learn==0.24.2
RUN pip install torch_geometric==2.4.0
RUN pip install --no-index torch_scatter==2.1.1 torch_sparse==0.6.17 -f https://data.pyg.org/whl/torch-1.13.0+cu117.html
RUN pip install graphlearn-torch==0.2.2
RUN pip install numpy==1.26.4

RUN apt-get update && apt-get install -y git wget unzip
RUN pip install git+https://github.com/mlcommons/logging.git

# TF32 instead of FP32 for faster compute
ENV NVIDIA_TF32_OVERRIDE=1

COPY . /workspace/repository
RUN chmod +x *.sh
WORKDIR /workspace/repository
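For local testing outside MLCube, the image can also be built and smoke-tested by hand. The sketch below assumes the tag from mlcube/mlcube.yaml and is illustrative rather than part of this PR. (The +cu117 pip wheels bundle their own CUDA runtime, so they run on the cuda11.6 base image.)

    # Build from graph_neural_network/ (MLCube's build context is "../" relative to mlcube/).
    docker build -f Dockerfile_mlcube -t mlcommons/graph_nn:0.0.1 .
    # Sanity-check that PyTorch sees the GPU inside the container.
    docker run --rm --gpus=all mlcommons/graph_nn:0.0.1 \
        python -c "import torch; print(torch.__version__, torch.cuda.is_available())"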
82 changes: 82 additions & 0 deletions graph_neural_network/compress_graph_demo.py
@@ -0,0 +1,82 @@
import argparse, datetime, os
import numpy as np
import torch
import os.path as osp

import graphlearn_torch as glt

from dataset import float2half
from download import download_dataset
from torch_geometric.utils import add_self_loops, remove_self_loops
from typing import Literal


class IGBHeteroDatasetCompress(object):
  def __init__(self,
               path,
               dataset_size,
               layout: Literal['CSC', 'CSR'] = 'CSC',):
    self.dir = path
    self.dataset_size = dataset_size
    self.layout = layout

    self.ntypes = ['paper']
    self.etypes = None
    self.edge_dict = {}
    self.paper_nodes_num = {'tiny': 100000, 'small': 1000000, 'medium': 10000000,
                            'large': 100000000, 'full': 269346174}
    if not osp.exists(osp.join(path, self.dataset_size, 'processed')):
      download_dataset(path, 'heterogeneous', dataset_size)
    self.process()

  def process(self):
    paper_paper_edges = torch.from_numpy(
        np.load(osp.join(self.dir, self.dataset_size, 'processed',
                         'paper__cites__paper', 'edge_index.npy'))).t()
    cites_edge = add_self_loops(remove_self_loops(paper_paper_edges)[0])[0]
    self.edge_dict = {
        ('paper', 'cites', 'paper'): (torch.cat([cites_edge[1, :], cites_edge[0, :]]),
                                      torch.cat([cites_edge[0, :], cites_edge[1, :]])),
    }
    self.etypes = list(self.edge_dict.keys())

    # init graphlearn_torch Dataset.
    edge_dir = 'out' if self.layout == 'CSR' else 'in'
    glt_dataset = glt.data.Dataset(edge_dir=edge_dir)
    glt_dataset.init_graph(
        edge_index=self.edge_dict,
        graph_mode='CPU',
    )

    # save the corresponding CSR or CSC file
    compress_edge_dict = {}
    compress_edge_dict[('paper', 'cites', 'paper')] = 'paper__cites__paper'

    for etype in self.etypes:
      graph = glt_dataset.get_graph(etype)
      indptr, indices, _ = graph.export_topology()
      path = os.path.join(self.dir, self.dataset_size, 'processed',
                          self.layout, compress_edge_dict[etype])
      if not os.path.exists(path):
        os.makedirs(path)
      torch.save(indptr, os.path.join(path, 'indptr.pt'))
      torch.save(indices, os.path.join(path, 'indices.pt'))
    path = os.path.join(self.dir, self.dataset_size, 'processed', self.layout)
    print(f"The {self.layout} graph has been persisted in path: {path}")


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  root = osp.join(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__)))),
                  'data', 'igbh')
  glt.utils.ensure_dir(root)
  parser.add_argument('--path', type=str, default=root,
                      help='path containing the datasets')
  parser.add_argument('--dataset_size', type=str, default='full',
                      choices=['tiny', 'small', 'medium', 'large', 'full'],
                      help='size of the datasets')
  parser.add_argument("--layout", type=str, default='CSC')
  parser.add_argument('--use_fp16', action="store_true",
                      help="convert the node/edge features into fp16 format")
  args = parser.parse_args()
  print(f"Start constructing the {args.layout} graph...")
  igbh_dataset = IGBHeteroDatasetCompress(args.path, args.dataset_size, args.layout)
  if args.use_fp16:
    base_path = osp.join(args.path, args.dataset_size, 'processed')
    float2half(base_path, args.dataset_size)
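run_demo.sh below wires this script up to MLCube, but it can also be invoked directly. A minimal sketch, with an illustrative dataset root rather than anything mandated by this PR:

    # Builds the CSC topology for the tiny split (downloading it first if missing) and
    # writes indptr.pt / indices.pt under ./igbh/tiny/processed/CSC/paper__cites__paper/.
    python compress_graph_demo.py --path ./igbh --dataset_size tiny --layout CSC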
101 changes: 101 additions & 0 deletions graph_neural_network/download_data.sh
@@ -0,0 +1,101 @@
#!/bin/bash

set -e

# Dataset root; the script appends full/processed itself. MLCube overrides this
# via --data_dir=<workspace>/data.
DATA_DIR="./igbh"

# Capture MLCube parameter
while [ $# -gt 0 ]; do
  case "$1" in
    --data_dir=*)
      DATA_DIR="${1#*=}"
      ;;
    *) ;;
  esac
  shift
done

# https://github.com/IllinoisGraphBenchmark/IGB-Datasets/blob/main/igb/download_igbh600m.sh
echo "IGBH600M download starting"
mkdir -p "$DATA_DIR/full/processed"
cd "$DATA_DIR/full/processed"

# paper
mkdir -p paper
cd paper
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/node_feat.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/node_label_19.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/node_label_2K.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper/paper_id_index_mapping.npy
cd ..

# paper__cites__paper
mkdir -p paper__cites__paper
cd paper__cites__paper
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__cites__paper/edge_index.npy
cd ..

# author
mkdir -p author
cd author
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/author/author_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/author/node_feat.npy
cd ..

# conference
mkdir -p conference
cd conference
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/conference/conference_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/conference/node_feat.npy
cd ..

# institute
mkdir -p institute
cd institute
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/institute/institute_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/institute/node_feat.npy
cd ..

# journal
mkdir -p journal
cd journal
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/journal/journal_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/journal/node_feat.npy
cd ..

# fos
mkdir -p fos
cd fos
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/fos/fos_id_index_mapping.npy
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/fos/node_feat.npy
cd ..

# author__affiliated_to__institute
mkdir -p author__affiliated_to__institute
cd author__affiliated_to__institute
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/author__affiliated_to__institute/edge_index.npy
cd ..

# paper__published__journal
mkdir -p paper__published__journal
cd paper__published__journal
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__published__journal/edge_index.npy
cd ..

# paper__topic__fos
mkdir -p paper__topic__fos
cd paper__topic__fos
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__topic__fos/edge_index.npy
cd ..

# paper__venue__conference
mkdir -p paper__venue__conference
cd paper__venue__conference
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__venue__conference/edge_index.npy
cd ..

# paper__written_by__author
mkdir -p paper__written_by__author
cd paper__written_by__author
wget -c https://igb-public.s3.us-east-2.amazonaws.com/IGBH/processed/paper__written_by__author/edge_index.npy
cd ..

echo "IGBH600M download complete"
23 changes: 23 additions & 0 deletions graph_neural_network/download_demo.sh
@@ -0,0 +1,23 @@
#!/bin/bash

set -e

# Dataset root; MLCube overrides this via --data_dir=<workspace>/data.
DATA_DIR="./igbh"

# Capture MLCube parameter
while [ $# -gt 0 ]; do
  case "$1" in
    --data_dir=*)
      DATA_DIR="${1#*=}"
      ;;
    *) ;;
  esac
  shift
done

echo "Minified dataset download starting ..."
mkdir -p "$DATA_DIR"
cd "$DATA_DIR"

wget https://mlcube.mlcommons-storage.org/minibenchmarks/gnn.zip
unzip -o gnn.zip
rm gnn.zip
echo "Minified dataset download completed"
1 change: 1 addition & 0 deletions graph_neural_network/mlcube/.gitignore
@@ -0,0 +1 @@
workspace/*
48 changes: 48 additions & 0 deletions graph_neural_network/mlcube/mlcube.yaml
@@ -0,0 +1,48 @@
name: graph_nn
description: Graph Neural Network
authors:
  - { name: "MLCommons Best Practices Working Group" }

platform:
  accelerator_count: 1

docker:
  # Image name.
  image: mlcommons/graph_nn:0.0.1
  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
  build_context: "../"
  # Docker file name within docker build context, default is `Dockerfile`.
  build_file: "Dockerfile_mlcube"
  # GPU arguments
  gpu_args: "--gpus=all --shm-size=1G"

tasks:
  download_data:
    entrypoint: ./download_data.sh -a
    parameters:
      outputs:
        data_dir: data/
  process_data:
    entrypoint: ./process_data.sh -a
    parameters:
      outputs:
        data_dir: data/
  train:
    entrypoint: ./run_and_time.sh -a
    parameters:
      inputs:
        data_dir: data/
      outputs:
        log_dir: logs/
  download_demo:
    entrypoint: ./download_demo.sh -a
    parameters:
      outputs:
        data_dir: data/
  demo:
    entrypoint: ./run_demo.sh -a
    parameters:
      inputs:
        data_dir: data/
      outputs:
        log_dir: logs/
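Assuming the standard MLCube runners (pip install mlcube mlcube-docker), the tasks above would typically be driven as follows from the mlcube/ directory. This is a sketch of conventional MLCube usage, not documentation shipped with this PR:

    # Full benchmark pipeline
    mlcube run --task=download_data -Pdocker.build_strategy=always
    mlcube run --task=process_data
    mlcube run --task=train

    # Minified demo pipeline
    mlcube run --task=download_demo
    mlcube run --task=demo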
18 changes: 18 additions & 0 deletions graph_neural_network/process_data.sh
@@ -0,0 +1,18 @@
#!/bin/bash

set -e

# Dataset root; MLCube overrides this via --data_dir=<workspace>/data.
DATA_DIR="./igbh"

# Capture MLCube parameter
while [ $# -gt 0 ]; do
  case "$1" in
    --data_dir=*)
      DATA_DIR="${1#*=}"
      ;;
    *) ;;
  esac
  shift
done

echo "Dataset processing starting"
python split_seeds.py --dataset_size='full' --path "$DATA_DIR"
echo "Dataset processing finished"
40 changes: 40 additions & 0 deletions graph_neural_network/run_and_time.sh
@@ -0,0 +1,40 @@
#!/bin/bash

set +x
set -e
set -o pipefail

# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# Set variables (DATA_DIR is the dataset root; MLCube overrides both via flags)
: "${DATA_DIR:=./igbh}"
: "${LOG_DIR:=./workspace/logs}"

# Handle MLCube parameters
while [ $# -gt 0 ]; do
  case "$1" in
    --data_dir=*)
      DATA_DIR="${1#*=}"
      ;;
    --log_dir=*)
      LOG_DIR="${1#*=}"
      ;;
    *) ;;
  esac
  shift
done

# run benchmark
echo "running benchmark"
mkdir -p "$LOG_DIR"

python compress_graph.py --path "$DATA_DIR" \
      --dataset_size='full' \
      --layout='CSC' \
      --use_fp16 |& tee "$LOG_DIR/train_console.log"

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# report total seconds elapsed
echo "RESULT: total runtime was $(( end - start )) seconds"
39 changes: 39 additions & 0 deletions graph_neural_network/run_demo.sh
@@ -0,0 +1,39 @@
#!/bin/bash

set +x
set -e
set -o pipefail

# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# Set variables (DATA_DIR is the dataset root; MLCube overrides both via flags)
: "${DATA_DIR:=./igbh}"
: "${LOG_DIR:=./workspace/logs}"

# Handle MLCube parameters
while [ $# -gt 0 ]; do
  case "$1" in
    --data_dir=*)
      DATA_DIR="${1#*=}"
      ;;
    --log_dir=*)
      LOG_DIR="${1#*=}"
      ;;
    *) ;;
  esac
  shift
done

# run benchmark
echo "running benchmark"
mkdir -p "$LOG_DIR"

python compress_graph_demo.py --path "$DATA_DIR" \
      --dataset_size='tiny' \
      --layout='CSC' |& tee "$LOG_DIR/demo_console.log"

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# report total seconds elapsed
echo "RESULT: total runtime was $(( end - start )) seconds"