From 8b7b50ad79e3c7de5db677df26526decf20e33b1 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 27 Jun 2022 13:40:02 +0800
Subject: [PATCH 01/24] [Benchmark] Add inference and profile in citation

---
 benchmark/citation/appnp.py      |  13 +++-
 benchmark/citation/arma.py       |  13 +++-
 benchmark/citation/cheb.py       |  13 +++-
 benchmark/citation/gat.py        |  13 +++-
 benchmark/citation/gcn.py        |  13 +++-
 benchmark/citation/sgc.py        |  13 +++-
 benchmark/citation/train_eval.py | 129 +++++++++++++++++++++----------
 7 files changed, 161 insertions(+), 46 deletions(-)

diff --git a/benchmark/citation/appnp.py b/benchmark/citation/appnp.py
index 10f805a0c06f..469240d2c307 100644
--- a/benchmark/citation/appnp.py
+++ b/benchmark/citation/appnp.py
@@ -20,6 +20,8 @@
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--K', type=int, default=10)
 parser.add_argument('--alpha', type=float, default=0.1)
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -47,4 +49,13 @@ def forward(self, data):
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
-    args.early_stopping, permute_masks)
+    args.early_stopping, args.inference, args.profile, permute_masks)
+
+if args.profile:
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    profile_file = profile_dir + 'profile-citation-APPNP-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
+    timeline_file = profile_dir + 'profile-citation-APPNP-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    os.rename('profile.log', profile_file)
+    os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/arma.py b/benchmark/citation/arma.py
index 0d0405e7a548..31c39b4a9e9a 100644
--- a/benchmark/citation/arma.py
+++ b/benchmark/citation/arma.py
@@ -21,6 +21,8 @@
 parser.add_argument('--num_layers', type=int, default=1)
 parser.add_argument('--shared_weights', type=bool, default=False)
 parser.add_argument('--skip_dropout', type=float, default=0.75)
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -49,4 +51,13 @@ def forward(self, data):
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
-    args.early_stopping, permute_masks)
+    args.early_stopping, args.inference, args.profile, permute_masks)
+
+if args.profile:
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    profile_file = profile_dir + 'profile-citation-ARMA-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
+    timeline_file = profile_dir + 'profile-citation-ARMA-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    os.rename('profile.log', profile_file)
+    os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/cheb.py b/benchmark/citation/cheb.py
index 18e6b4b23934..ba56dd323522 100644
--- a/benchmark/citation/cheb.py
+++ b/benchmark/citation/cheb.py
@@ -18,6 +18,8 @@
 parser.add_argument('--dropout', type=float, default=0.5)
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--num_hops', type=int, default=3)
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -42,4 +44,13 @@ def forward(self, data):
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
-    args.early_stopping, permute_masks)
+    args.early_stopping, args.inference, args.profile, permute_masks)
+
+if args.profile:
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    profile_file = profile_dir + 'profile-citation-CHEBY-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
+    timeline_file = profile_dir + 'profile-citation-CHEBY-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    os.rename('profile.log', profile_file)
+    os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/gat.py b/benchmark/citation/gat.py
index 0f85e5144a73..52e0cea82581 100644
--- a/benchmark/citation/gat.py
+++ b/benchmark/citation/gat.py
@@ -19,6 +19,8 @@
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--heads', type=int, default=8)
 parser.add_argument('--output_heads', type=int, default=1)
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -47,4 +49,13 @@ def forward(self, data):
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
-    args.early_stopping, permute_masks)
+    args.early_stopping, args.inference, args.profile, permute_masks)
+
+if args.profile:
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    profile_file = profile_dir + 'profile-citation-GAT-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
+    timeline_file = profile_dir + 'profile-citation-GAT-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    os.rename('profile.log', profile_file)
+    os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/gcn.py b/benchmark/citation/gcn.py
index b8c220f519b8..df0b3d74385e 100644
--- a/benchmark/citation/gcn.py
+++ b/benchmark/citation/gcn.py
@@ -17,6 +17,8 @@
 parser.add_argument('--hidden', type=int, default=16)
 parser.add_argument('--dropout', type=float, default=0.5)
 parser.add_argument('--normalize_features', type=bool, default=True)
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -41,4 +43,13 @@ def forward(self, data):
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
-    args.early_stopping, permute_masks)
+    args.early_stopping, args.inference, args.profile, permute_masks)
+
+if args.profile:
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    profile_file = profile_dir + 'profile-citation-GCN-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
+    timeline_file = profile_dir + 'profile-citation-GCN-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    os.rename('profile.log', profile_file)
+    os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/sgc.py b/benchmark/citation/sgc.py
index b21a37e07a5c..5adf3814c189 100644
--- a/benchmark/citation/sgc.py
+++ b/benchmark/citation/sgc.py
@@ -16,6 +16,8 @@
 parser.add_argument('--early_stopping', type=int, default=10)
 parser.add_argument('--normalize_features', type=bool, default=False)
 parser.add_argument('--K', type=int, default=2)
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -37,4 +39,13 @@ def forward(self, data):
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
-    args.early_stopping, permute_masks)
+    args.early_stopping, args.inference, args.profile, permute_masks)
+
+if args.profile:
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    profile_file = profile_dir + 'profile-citation-SGC-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
+    timeline_file = profile_dir + 'profile-citation-SGC-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    os.rename('profile.log', profile_file)
+    os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/train_eval.py b/benchmark/citation/train_eval.py
index 5e5c190d81e5..049cfb4617aa 100644
--- a/benchmark/citation/train_eval.py
+++ b/benchmark/citation/train_eval.py
@@ -4,10 +4,12 @@
 import torch.nn.functional as F
 from torch import tensor
 from torch.optim import Adam
+from torch.profiler import profile, ProfilerActivity
 
 from torch_geometric.utils import index_to_mask
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+profile_sort = "self_cuda_time_total" if torch.cuda.is_available() else "self_cpu_time_total"
 
 
 def random_planetoid_splits(data, num_classes):
@@ -33,61 +35,103 @@ def random_planetoid_splits(data, num_classes):
 
     return data
 
-
-def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
+def trace_handler(p):
+    output = p.key_averages().table(sort_by=profile_sort)
+    print(output)
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    profile_file = profile_dir + 'profile' + '.log'
+    with open(profile_file, 'w') as f:
+        f.write(output)
+        f.close()
+    timeline_file = profile_dir + 'timeline' + '.json'
+    p.export_chrome_trace(timeline_file)
+
+def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inference, profiling,
         permute_masks=None, logger=None):
-
     val_losses, accs, durations = [], [], []
-    for _ in range(runs):
-        data = dataset[0]
-        if permute_masks is not None:
-            data = permute_masks(data, dataset.num_classes)
-        data = data.to(device)
+    if not inference:
+        for _ in range(runs):
+            data = dataset[0]
+            if permute_masks is not None:
+                data = permute_masks(data, dataset.num_classes)
+            data = data.to(device)
+
+            model.to(device).reset_parameters()
+            optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+
+            t_start = time.perf_counter()
+
+            best_val_loss = float('inf')
+            test_acc = 0
+            val_loss_history = []
 
-        model.to(device).reset_parameters()
-        optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+            for epoch in range(1, epochs + 1):
+                train(model, optimizer, data)
+                eval_info = evaluate(model, data)
+                eval_info['epoch'] = epoch
 
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
+                if logger is not None:
+                    logger(eval_info)
 
-        t_start = time.perf_counter()
+                if eval_info['val_loss'] < best_val_loss:
+                    best_val_loss = eval_info['val_loss']
+                    test_acc = eval_info['test_acc']
 
-        best_val_loss = float('inf')
-        test_acc = 0
-        val_loss_history = []
+                val_loss_history.append(eval_info['val_loss'])
+                if early_stopping > 0 and epoch > epochs // 2:
+                    tmp = tensor(val_loss_history[-(early_stopping + 1):-1])
+                    if eval_info['val_loss'] > tmp.mean().item():
+                        break
 
-        for epoch in range(1, epochs + 1):
-            train(model, optimizer, data)
-            eval_info = evaluate(model, data)
-            eval_info['epoch'] = epoch
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
 
-            if logger is not None:
-                logger(eval_info)
+            t_end = time.perf_counter()
 
-            if eval_info['val_loss'] < best_val_loss:
-                best_val_loss = eval_info['val_loss']
-                test_acc = eval_info['test_acc']
+            val_losses.append(best_val_loss)
+            accs.append(test_acc)
+            durations.append(t_end - t_start)
+        loss, acc, duration = tensor(val_losses), tensor(accs), tensor(durations)
 
-            val_loss_history.append(eval_info['val_loss'])
-            if early_stopping > 0 and epoch > epochs // 2:
-                tmp = tensor(val_loss_history[-(early_stopping + 1):-1])
-                if eval_info['val_loss'] > tmp.mean().item():
-                    break
+        print(f'Val Loss: {float(loss.mean()):.4f}, '
+            f'Test Accuracy: {float(acc.mean()):.3f} ± {float(acc.std()):.3f}, '
+            f'Duration: {float(duration.mean()):.3f}s')
+    else:
+        for i in range(runs):
+            data = dataset[0]
+            if permute_masks is not None:
+                data = permute_masks(data, dataset.num_classes)
+            data = data.to(device)
 
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
+            model.to(device).reset_parameters()
 
-        t_end = time.perf_counter()
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
 
-        val_losses.append(best_val_loss)
-        accs.append(test_acc)
-        durations.append(t_end - t_start)
+            t_start = time.perf_counter()
 
-    loss, acc, duration = tensor(val_losses), tensor(accs), tensor(durations)
+            for epoch in range(1, epochs + 1):
+                if profiling and i == int(runs / 2) and epoch == int(epochs / 2):
+                    with profile(activities=[
+                        ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                        on_trace_ready=trace_handler) as p:
+                        test(model, data)
+                        p.step()
+                else:
+                    test(model, data)
 
-    print(f'Val Loss: {float(loss.mean()):.4f}, '
-          f'Test Accuracy: {float(acc.mean()):.3f} ± {float(acc.std()):.3f}, '
-          f'Duration: {float(duration.mean()):.3f}')
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+
+            t_end = time.perf_counter()
+            durations.append(t_end - t_start)
+
+        duration = tensor(durations)
+        print(f'Inference Duration: {float(duration.mean()):.3f}s')
 
 
 def train(model, optimizer, data):
@@ -116,3 +160,8 @@ def evaluate(model, data):
         outs[f'{key}_acc'] = acc
 
     return outs
+
+def test(model, data):
+    model.eval()
+    with torch.no_grad():
+        logits = model(data)

From 064d52923a5168ab2e044435c822913020d15061 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Wed, 29 Jun 2022 08:50:42 +0800
Subject: [PATCH 02/24] Print end-to-end time of inference

---
 benchmark/citation/appnp.py      |  7 +++++--
 benchmark/citation/arma.py       |  7 +++++--
 benchmark/citation/cheb.py       |  7 +++++--
 benchmark/citation/gat.py        |  7 +++++--
 benchmark/citation/gcn.py        |  7 +++++--
 benchmark/citation/run.sh        | 36 ++++++++++++++++++++++++++++++++
 benchmark/citation/sgc.py        |  7 +++++--
 benchmark/citation/train_eval.py | 13 ------------
 8 files changed, 66 insertions(+), 25 deletions(-)

diff --git a/benchmark/citation/appnp.py b/benchmark/citation/appnp.py
index 469240d2c307..97a28699589a 100644
--- a/benchmark/citation/appnp.py
+++ b/benchmark/citation/appnp.py
@@ -1,4 +1,5 @@
 import argparse
+import time
 
 import torch
 import torch.nn.functional as F
@@ -48,14 +49,16 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
+t_start = time.time()
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
+t_end = time.time()
+duration = t_end - t_start
+print("appnp-", args.dataset, "-", args.random_splits, ": End-to-End time: ", duration, " s")
 
 if args.profile:
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    profile_file = profile_dir + 'profile-citation-APPNP-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
     timeline_file = profile_dir + 'profile-citation-APPNP-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
-    os.rename('profile.log', profile_file)
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/arma.py b/benchmark/citation/arma.py
index 31c39b4a9e9a..103aca88c0c5 100644
--- a/benchmark/citation/arma.py
+++ b/benchmark/citation/arma.py
@@ -1,4 +1,5 @@
 import argparse
+import time
 
 import torch
 import torch.nn.functional as F
@@ -50,14 +51,16 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
+t_start = time.time()
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
+t_end = time.time()
+duration = t_end - t_start
+print("arma-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    profile_file = profile_dir + 'profile-citation-ARMA-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
     timeline_file = profile_dir + 'profile-citation-ARMA-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
-    os.rename('profile.log', profile_file)
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/cheb.py b/benchmark/citation/cheb.py
index ba56dd323522..7a8788a704fc 100644
--- a/benchmark/citation/cheb.py
+++ b/benchmark/citation/cheb.py
@@ -1,4 +1,5 @@
 import argparse
+import time
 
 import torch
 import torch.nn.functional as F
@@ -43,14 +44,16 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
+t_start = time.time()
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
+t_end = time.time()
+duration = t_end - t_start
+print("cheby-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    profile_file = profile_dir + 'profile-citation-CHEBY-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
     timeline_file = profile_dir + 'profile-citation-CHEBY-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
-    os.rename('profile.log', profile_file)
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/gat.py b/benchmark/citation/gat.py
index 52e0cea82581..551375534355 100644
--- a/benchmark/citation/gat.py
+++ b/benchmark/citation/gat.py
@@ -1,4 +1,5 @@
 import argparse
+import time
 
 import torch
 import torch.nn.functional as F
@@ -48,14 +49,16 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
+t_start = time.time()
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
+t_end = time.time()
+duration = t_end - t_start
+print("gat-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    profile_file = profile_dir + 'profile-citation-GAT-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
     timeline_file = profile_dir + 'profile-citation-GAT-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
-    os.rename('profile.log', profile_file)
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/gcn.py b/benchmark/citation/gcn.py
index df0b3d74385e..7bdbd2c1b64a 100644
--- a/benchmark/citation/gcn.py
+++ b/benchmark/citation/gcn.py
@@ -1,4 +1,5 @@
 import argparse
+import time
 
 import torch
 import torch.nn.functional as F
@@ -42,14 +43,16 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
+t_start = time.time()
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
+t_end = time.time()
+duration = t_end - t_start
+print("gcn-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    profile_file = profile_dir + 'profile-citation-GCN-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
     timeline_file = profile_dir + 'profile-citation-GCN-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
-    os.rename('profile.log', profile_file)
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/run.sh b/benchmark/citation/run.sh
index dc584555fe7a..6e4288f678e1 100755
--- a/benchmark/citation/run.sh
+++ b/benchmark/citation/run.sh
@@ -6,26 +6,38 @@ echo "===="
 echo "GCN"
 python gcn.py --dataset=Cora
 python gcn.py --dataset=Cora --random_splits=True
+python gcn.py --dataset=Cora --inference=True --profile=True
+python gcn.py --dataset=Cora --random_splits=True --inference=True --profile=True
 
 echo "GAT"
 python gat.py --dataset=Cora
 python gat.py --dataset=Cora --random_splits=True
+python gat.py --dataset=Cora --inference=True --profile=True
+python gat.py --dataset=Cora --random_splits=True --inference=True --profile=True
 
 echo "Cheby"
 python cheb.py --dataset=Cora --num_hops=3
 python cheb.py --dataset=Cora --num_hops=3 --random_splits=True
+python cheb.py --dataset=Cora --num_hops=3 --inference=True --profile=True
+python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference=True --profile=True
 
 echo "SGC"
 python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005
 python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference=True --profile=True
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
 
 echo "ARMA"
 python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True
 python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True
+python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference=True --profile=True
+python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
 
 echo "APPNP"
 python appnp.py --dataset=Cora --alpha=0.1
 python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True
+python appnp.py --dataset=Cora --alpha=0.1 --inference=True --profile=True
+python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference=True --profile=True
 
 echo "CiteSeer"
 echo "========"
@@ -33,26 +45,38 @@ echo "========"
 echo "GCN"
 python gcn.py --dataset=CiteSeer
 python gcn.py --dataset=CiteSeer --random_splits=True
+python gcn.py --dataset=CiteSeer --inference=True --profile=True
+python gcn.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
 
 echo "GAT"
 python gat.py --dataset=CiteSeer
 python gat.py --dataset=CiteSeer --random_splits=True
+python gat.py --dataset=CiteSeer --inference=True --profile=True
+python gat.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
 
 echo "Cheby"
 python cheb.py --dataset=CiteSeer --num_hops=2
 python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True
+python cheb.py --dataset=CiteSeer --num_hops=2 --inference=True --profile=True
+python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference=True --profile=True
 
 echo "SGC"
 python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005
 python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference=True --profile=True
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference=True --profile=True
 
 echo "ARMA"
 python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True
 python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference=True --profile=True
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
 
 echo "APPNP"
 python appnp.py --dataset=CiteSeer --alpha=0.1
 python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True
+python appnp.py --dataset=CiteSeer --alpha=0.1 --inference=True --profile=True
+python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference=True --profile=True
 
 echo "PubMed"
 echo "======"
@@ -60,23 +84,35 @@ echo "======"
 echo "GCN"
 python gcn.py --dataset=PubMed
 python gcn.py --dataset=PubMed --random_splits=True
+python gcn.py --dataset=PubMed --inference=True --profile=True
+python gcn.py --dataset=PubMed --random_splits=True --inference=True --profile=True
 
 echo "GAT"
 python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8
 python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference=True --profile=True
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference=True --profile=True
 
 echo "Cheby"
 python cheb.py --dataset=PubMed --num_hops=2
 python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True
+python cheb.py --dataset=PubMed --num_hops=2 --inference=True --profile=True
+python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference=True --profile=True
 
 echo "SGC"
 python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005
 python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference=True --profile=True
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
 
 echo "ARMA"
 python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0
 python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference=True --profile=True
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference=True --profile=True
 
 echo "APPNP"
 python appnp.py --dataset=PubMed --alpha=0.1
 python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True
+python appnp.py --dataset=PubMed --alpha=0.1 --inference=True --profile=True
+python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference=True --profile=True
diff --git a/benchmark/citation/sgc.py b/benchmark/citation/sgc.py
index 5adf3814c189..c4b769e80899 100644
--- a/benchmark/citation/sgc.py
+++ b/benchmark/citation/sgc.py
@@ -1,4 +1,5 @@
 import argparse
+import time
 
 import torch
 import torch.nn.functional as F
@@ -38,14 +39,16 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
+t_start = time.time()
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
+t_end = time.time()
+duration = t_end - t_start
+print("sgc-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    profile_file = profile_dir + 'profile-citation-SGC-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.log'
     timeline_file = profile_dir + 'profile-citation-SGC-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
-    os.rename('profile.log', profile_file)
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/train_eval.py b/benchmark/citation/train_eval.py
index 049cfb4617aa..5390a7bfc5fd 100644
--- a/benchmark/citation/train_eval.py
+++ b/benchmark/citation/train_eval.py
@@ -40,10 +40,6 @@ def trace_handler(p):
     print(output)
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    profile_file = profile_dir + 'profile' + '.log'
-    with open(profile_file, 'w') as f:
-        f.write(output)
-        f.close()
     timeline_file = profile_dir + 'timeline' + '.json'
     p.export_chrome_trace(timeline_file)
 
@@ -112,8 +108,6 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inferenc
             if torch.cuda.is_available():
                 torch.cuda.synchronize()
 
-            t_start = time.perf_counter()
-
             for epoch in range(1, epochs + 1):
                 if profiling and i == int(runs / 2) and epoch == int(epochs / 2):
                     with profile(activities=[
@@ -127,13 +121,6 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inferenc
             if torch.cuda.is_available():
                 torch.cuda.synchronize()
 
-            t_end = time.perf_counter()
-            durations.append(t_end - t_start)
-
-        duration = tensor(durations)
-        print(f'Inference Duration: {float(duration.mean()):.3f}s')
-
-
 def train(model, optimizer, data):
     model.train()
     optimizer.zero_grad()

From fcd08fb5cd2f6ef042ecfccba3a3770e5b1d5194 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Wed, 29 Jun 2022 13:03:44 +0800
Subject: [PATCH 03/24] Print end-to-end time of one epoch

---
 benchmark/citation/appnp.py      |  6 +-----
 benchmark/citation/arma.py       |  6 +-----
 benchmark/citation/cheb.py       |  6 +-----
 benchmark/citation/gat.py        |  6 +-----
 benchmark/citation/gcn.py        |  6 +-----
 benchmark/citation/sgc.py        |  6 +-----
 benchmark/citation/train_eval.py | 31 +++++++++++++++++++------------
 7 files changed, 25 insertions(+), 42 deletions(-)

diff --git a/benchmark/citation/appnp.py b/benchmark/citation/appnp.py
index 97a28699589a..a0812060bbc1 100644
--- a/benchmark/citation/appnp.py
+++ b/benchmark/citation/appnp.py
@@ -1,5 +1,4 @@
 import argparse
-import time
 
 import torch
 import torch.nn.functional as F
@@ -49,12 +48,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-t_start = time.time()
+print("appnp-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
-t_end = time.time()
-duration = t_end - t_start
-print("appnp-", args.dataset, "-", args.random_splits, ": End-to-End time: ", duration, " s")
 
 if args.profile:
     import os
diff --git a/benchmark/citation/arma.py b/benchmark/citation/arma.py
index 103aca88c0c5..43d587705422 100644
--- a/benchmark/citation/arma.py
+++ b/benchmark/citation/arma.py
@@ -1,5 +1,4 @@
 import argparse
-import time
 
 import torch
 import torch.nn.functional as F
@@ -51,12 +50,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-t_start = time.time()
+print("arma-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
-t_end = time.time()
-duration = t_end - t_start
-print("arma-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
diff --git a/benchmark/citation/cheb.py b/benchmark/citation/cheb.py
index 7a8788a704fc..0ca8d4780f8f 100644
--- a/benchmark/citation/cheb.py
+++ b/benchmark/citation/cheb.py
@@ -1,5 +1,4 @@
 import argparse
-import time
 
 import torch
 import torch.nn.functional as F
@@ -44,12 +43,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-t_start = time.time()
+print("cheby-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
-t_end = time.time()
-duration = t_end - t_start
-print("cheby-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
diff --git a/benchmark/citation/gat.py b/benchmark/citation/gat.py
index 551375534355..4f8b2027be12 100644
--- a/benchmark/citation/gat.py
+++ b/benchmark/citation/gat.py
@@ -1,5 +1,4 @@
 import argparse
-import time
 
 import torch
 import torch.nn.functional as F
@@ -49,12 +48,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-t_start = time.time()
+print("gat-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
-t_end = time.time()
-duration = t_end - t_start
-print("gat-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
diff --git a/benchmark/citation/gcn.py b/benchmark/citation/gcn.py
index 7bdbd2c1b64a..b84b91133e4b 100644
--- a/benchmark/citation/gcn.py
+++ b/benchmark/citation/gcn.py
@@ -1,5 +1,4 @@
 import argparse
-import time
 
 import torch
 import torch.nn.functional as F
@@ -43,12 +42,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-t_start = time.time()
+print("gcn-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
-t_end = time.time()
-duration = t_end - t_start
-print("gcn-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
diff --git a/benchmark/citation/sgc.py b/benchmark/citation/sgc.py
index c4b769e80899..4d836c824c9d 100644
--- a/benchmark/citation/sgc.py
+++ b/benchmark/citation/sgc.py
@@ -1,5 +1,4 @@
 import argparse
-import time
 
 import torch
 import torch.nn.functional as F
@@ -39,12 +38,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-t_start = time.time()
+print("gcn-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
-t_end = time.time()
-duration = t_end - t_start
-print("sgc-{}-{}: End-to-End time: {} s".format(args.dataset, args.random_splits, duration))
 
 if args.profile:
     import os
diff --git a/benchmark/citation/train_eval.py b/benchmark/citation/train_eval.py
index 5390a7bfc5fd..7f095b5e4022 100644
--- a/benchmark/citation/train_eval.py
+++ b/benchmark/citation/train_eval.py
@@ -105,22 +105,29 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inferenc
 
             model.to(device).reset_parameters()
 
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
-
             for epoch in range(1, epochs + 1):
-                if profiling and i == int(runs / 2) and epoch == int(epochs / 2):
-                    with profile(activities=[
-                        ProfilerActivity.CPU, ProfilerActivity.CUDA],
-                        on_trace_ready=trace_handler) as p:
+                if i == int(runs / 2) and epoch == int(epochs / 2):
+                    if profiling:
+                        with profile(activities=[
+                            ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                            on_trace_ready=trace_handler) as p:
+                            test(model, data)
+                            p.step()
+                    else:
+                        if torch.cuda.is_available():
+                            torch.cuda.synchronize()
+                        t_start = time.time()
+
                         test(model, data)
-                        p.step()
+
+                        if torch.cuda.is_available():
+                            torch.cuda.synchronize()
+                        t_end = time.time()
+                        duration = t_end - t_start
+                        print("End-to-End time: {} s".format(duration), flush=True)
                 else:
                     test(model, data)
 
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
-
 def train(model, optimizer, data):
     model.train()
     optimizer.zero_grad()
@@ -151,4 +158,4 @@ def evaluate(model, data):
 def test(model, data):
     model.eval()
     with torch.no_grad():
-        logits = model(data)
+        model(data)

From 0a0d34980efd84f1fe0db7920ffef8fb8df76efa Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Thu, 30 Jun 2022 14:16:23 +0800
Subject: [PATCH 04/24] Add inference.sh

---
 benchmark/citation/inference.sh | 118 ++++++++++++++++++++++++++++++++
 benchmark/citation/run.sh       |  36 ----------
 benchmark/citation/sgc.py       |   2 +-
 3 files changed, 119 insertions(+), 37 deletions(-)
 create mode 100755 benchmark/citation/inference.sh

diff --git a/benchmark/citation/inference.sh b/benchmark/citation/inference.sh
new file mode 100755
index 000000000000..7c8180573bee
--- /dev/null
+++ b/benchmark/citation/inference.sh
@@ -0,0 +1,118 @@
+#!/bin/sh
+
+echo "Cora"
+echo "===="
+
+echo "GCN"
+python gcn.py --dataset=Cora --inference=True
+python gcn.py --dataset=Cora --random_splits=True --inference=True
+python gcn.py --dataset=Cora --inference=True --profile=True
+python gcn.py --dataset=Cora --random_splits=True --inference=True --profile=True
+
+echo "GAT"
+python gat.py --dataset=Cora --inference=True
+python gat.py --dataset=Cora --random_splits=True --inference=True
+python gat.py --dataset=Cora --inference=True --profile=True
+python gat.py --dataset=Cora --random_splits=True --inference=True --profile=True
+
+echo "Cheby"
+python cheb.py --dataset=Cora --num_hops=3 --inference=True
+python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference=True
+python cheb.py --dataset=Cora --num_hops=3 --inference=True --profile=True
+python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference=True --profile=True
+
+echo "SGC"
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference=True
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference=True
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference=True --profile=True
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
+
+echo "ARMA"
+python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference=True
+python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True
+python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference=True --profile=True
+python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
+
+echo "APPNP"
+python appnp.py --dataset=Cora --alpha=0.1 --inference=True
+python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference=True
+python appnp.py --dataset=Cora --alpha=0.1 --inference=True --profile=True
+python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference=True --profile=True
+
+echo "CiteSeer"
+echo "========"
+
+echo "GCN"
+python gcn.py --dataset=CiteSeer --inference=True
+python gcn.py --dataset=CiteSeer --random_splits=True --inference=True
+python gcn.py --dataset=CiteSeer --inference=True --profile=True
+python gcn.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
+
+echo "GAT"
+python gat.py --dataset=CiteSeer --inference=True
+python gat.py --dataset=CiteSeer --random_splits=True --inference=True
+python gat.py --dataset=CiteSeer --inference=True --profile=True
+python gat.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
+
+echo "Cheby"
+python cheb.py --dataset=CiteSeer --num_hops=2 --inference=True
+python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference=True
+python cheb.py --dataset=CiteSeer --num_hops=2 --inference=True --profile=True
+python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference=True --profile=True
+
+echo "SGC"
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference=True
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference=True
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference=True --profile=True
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference=True --profile=True
+
+echo "ARMA"
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference=True
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference=True --profile=True
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
+
+echo "APPNP"
+python appnp.py --dataset=CiteSeer --alpha=0.1 --inference=True
+python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference=True
+python appnp.py --dataset=CiteSeer --alpha=0.1 --inference=True --profile=True
+python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference=True --profile=True
+
+echo "PubMed"
+echo "======"
+
+echo "GCN"
+python gcn.py --dataset=PubMed --inference=True
+python gcn.py --dataset=PubMed --random_splits=True --inference=True
+python gcn.py --dataset=PubMed --inference=True --profile=True
+python gcn.py --dataset=PubMed --random_splits=True --inference=True --profile=True
+
+echo "GAT"
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference=True
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference=True
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference=True --profile=True
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference=True --profile=True
+
+echo "Cheby"
+python cheb.py --dataset=PubMed --num_hops=2 --inference=True
+python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference=True
+python cheb.py --dataset=PubMed --num_hops=2 --inference=True --profile=True
+python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference=True --profile=True
+
+echo "SGC"
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference=True
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference=True
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference=True --profile=True
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
+
+echo "ARMA"
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference=True
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference=True
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference=True --profile=True
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference=True --profile=True
+
+echo "APPNP"
+python appnp.py --dataset=PubMed --alpha=0.1 --inference=True
+python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference=True
+python appnp.py --dataset=PubMed --alpha=0.1 --inference=True --profile=True
+python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference=True --profile=True
diff --git a/benchmark/citation/run.sh b/benchmark/citation/run.sh
index 6e4288f678e1..dc584555fe7a 100755
--- a/benchmark/citation/run.sh
+++ b/benchmark/citation/run.sh
@@ -6,38 +6,26 @@ echo "===="
 echo "GCN"
 python gcn.py --dataset=Cora
 python gcn.py --dataset=Cora --random_splits=True
-python gcn.py --dataset=Cora --inference=True --profile=True
-python gcn.py --dataset=Cora --random_splits=True --inference=True --profile=True
 
 echo "GAT"
 python gat.py --dataset=Cora
 python gat.py --dataset=Cora --random_splits=True
-python gat.py --dataset=Cora --inference=True --profile=True
-python gat.py --dataset=Cora --random_splits=True --inference=True --profile=True
 
 echo "Cheby"
 python cheb.py --dataset=Cora --num_hops=3
 python cheb.py --dataset=Cora --num_hops=3 --random_splits=True
-python cheb.py --dataset=Cora --num_hops=3 --inference=True --profile=True
-python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference=True --profile=True
 
 echo "SGC"
 python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005
 python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference=True --profile=True
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
 
 echo "ARMA"
 python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True
 python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True
-python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference=True --profile=True
-python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
 
 echo "APPNP"
 python appnp.py --dataset=Cora --alpha=0.1
 python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True
-python appnp.py --dataset=Cora --alpha=0.1 --inference=True --profile=True
-python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference=True --profile=True
 
 echo "CiteSeer"
 echo "========"
@@ -45,38 +33,26 @@ echo "========"
 echo "GCN"
 python gcn.py --dataset=CiteSeer
 python gcn.py --dataset=CiteSeer --random_splits=True
-python gcn.py --dataset=CiteSeer --inference=True --profile=True
-python gcn.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
 
 echo "GAT"
 python gat.py --dataset=CiteSeer
 python gat.py --dataset=CiteSeer --random_splits=True
-python gat.py --dataset=CiteSeer --inference=True --profile=True
-python gat.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
 
 echo "Cheby"
 python cheb.py --dataset=CiteSeer --num_hops=2
 python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True
-python cheb.py --dataset=CiteSeer --num_hops=2 --inference=True --profile=True
-python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference=True --profile=True
 
 echo "SGC"
 python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005
 python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference=True --profile=True
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference=True --profile=True
 
 echo "ARMA"
 python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True
 python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference=True --profile=True
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
 
 echo "APPNP"
 python appnp.py --dataset=CiteSeer --alpha=0.1
 python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True
-python appnp.py --dataset=CiteSeer --alpha=0.1 --inference=True --profile=True
-python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference=True --profile=True
 
 echo "PubMed"
 echo "======"
@@ -84,35 +60,23 @@ echo "======"
 echo "GCN"
 python gcn.py --dataset=PubMed
 python gcn.py --dataset=PubMed --random_splits=True
-python gcn.py --dataset=PubMed --inference=True --profile=True
-python gcn.py --dataset=PubMed --random_splits=True --inference=True --profile=True
 
 echo "GAT"
 python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8
 python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference=True --profile=True
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference=True --profile=True
 
 echo "Cheby"
 python cheb.py --dataset=PubMed --num_hops=2
 python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True
-python cheb.py --dataset=PubMed --num_hops=2 --inference=True --profile=True
-python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference=True --profile=True
 
 echo "SGC"
 python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005
 python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference=True --profile=True
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
 
 echo "ARMA"
 python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0
 python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference=True --profile=True
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference=True --profile=True
 
 echo "APPNP"
 python appnp.py --dataset=PubMed --alpha=0.1
 python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True
-python appnp.py --dataset=PubMed --alpha=0.1 --inference=True --profile=True
-python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference=True --profile=True
diff --git a/benchmark/citation/sgc.py b/benchmark/citation/sgc.py
index 4d836c824c9d..936879a81824 100644
--- a/benchmark/citation/sgc.py
+++ b/benchmark/citation/sgc.py
@@ -38,7 +38,7 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-print("gcn-{}-{}:".format(args.dataset, args.random_splits), end=' ')
+print("sgc-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
 

From 5eb049122aa798a1eb51539b63a221c7884f3227 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Fri, 1 Jul 2022 10:56:22 +0800
Subject: [PATCH 05/24] Add inference and profile for to_hetero_mag

---
 examples/hetero/to_hetero_mag.py | 51 ++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/examples/hetero/to_hetero_mag.py b/examples/hetero/to_hetero_mag.py
index 6605038c9af3..69c2e97ecd74 100644
--- a/examples/hetero/to_hetero_mag.py
+++ b/examples/hetero/to_hetero_mag.py
@@ -1,10 +1,12 @@
 import argparse
 import os.path as osp
+import time
 
 import torch
 import torch.nn.functional as F
 from torch.nn import ReLU
 from tqdm import tqdm
+from torch.profiler import profile, ProfilerActivity
 
 import torch_geometric.transforms as T
 from torch_geometric.datasets import OGB_MAG
@@ -13,9 +15,12 @@
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--use_hgt_loader', action='store_true')
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
 args = parser.parse_args()
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+profile_sort = "self_cuda_time_total" if torch.cuda.is_available() else "self_cpu_time_total"
 
 path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/OGB')
 transform = T.ToUndirected(merge=True)
@@ -48,6 +53,13 @@
 ])
 model = to_hetero(model, data.metadata(), aggr='sum').to(device)
 
+def trace_handler(p):
+    output = p.key_averages().table(sort_by=profile_sort)
+    print(output)
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'timeline-to-hetero-mag' + '.json'
+    p.export_chrome_trace(timeline_file)
 
 @torch.no_grad()
 def init_params():
@@ -92,11 +104,40 @@ def test(loader):
 
     return total_correct / total_examples
 
+@torch.no_grad()
+def inference(loader):
+    model.eval()
+    for batch in tqdm(loader):
+        batch = batch.to(device, 'edge_index')
+        batch_size = batch['paper'].batch_size
+        model(batch.x_dict, batch.edge_index_dict)
 
 init_params()  # Initialize parameters.
-optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+if not args.inference:
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
 
-for epoch in range(1, 21):
-    loss = train()
-    val_acc = test(val_loader)
-    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_acc:.4f}')
+    for epoch in range(1, 21):
+        loss = train()
+        val_acc = test(val_loader)
+        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_acc:.4f}')
+else:
+    for epoch in range(1, 21):
+        if epoch == 20:
+            if args.profile:
+                with profile(activities=[
+                    ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                    on_trace_ready=trace_handler) as p:
+                     inference(val_loader)
+                     p.step()
+            else:
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_start = time.time()
+                inference(val_loader)
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_end = time.time()
+                duration = t_end - t_start
+                print("End-to-End time: {} s".format(duration), flush=True)
+        else:
+            inference(val_loader)

From ac33b480c315c17b3ea679c3f59418362efa9ad8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 1 Jul 2022 04:46:31 +0000
Subject: [PATCH 06/24] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 benchmark/citation/appnp.py      |  6 ++++--
 benchmark/citation/arma.py       |  6 ++++--
 benchmark/citation/cheb.py       |  6 ++++--
 benchmark/citation/gat.py        |  6 ++++--
 benchmark/citation/gcn.py        |  6 ++++--
 benchmark/citation/sgc.py        |  6 ++++--
 benchmark/citation/train_eval.py | 32 +++++++++++++++++++++-----------
 examples/hetero/to_hetero_mag.py | 23 +++++++++++++++--------
 8 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/benchmark/citation/appnp.py b/benchmark/citation/appnp.py
index a0812060bbc1..4a9875921b2d 100644
--- a/benchmark/citation/appnp.py
+++ b/benchmark/citation/appnp.py
@@ -21,7 +21,8 @@
 parser.add_argument('--K', type=int, default=10)
 parser.add_argument('--alpha', type=float, default=0.1)
 parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -56,5 +57,6 @@ def forward(self, data):
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-APPNP-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    timeline_file = profile_dir + 'profile-citation-APPNP-' + args.dataset + '-random_splits-' + str(
+        args.random_splits) + '.json'
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/arma.py b/benchmark/citation/arma.py
index 43d587705422..5c65e9e0d814 100644
--- a/benchmark/citation/arma.py
+++ b/benchmark/citation/arma.py
@@ -22,7 +22,8 @@
 parser.add_argument('--shared_weights', type=bool, default=False)
 parser.add_argument('--skip_dropout', type=float, default=0.75)
 parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -58,5 +59,6 @@ def forward(self, data):
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-ARMA-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    timeline_file = profile_dir + 'profile-citation-ARMA-' + args.dataset + '-random_splits-' + str(
+        args.random_splits) + '.json'
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/cheb.py b/benchmark/citation/cheb.py
index 0ca8d4780f8f..b33c2fe2e96d 100644
--- a/benchmark/citation/cheb.py
+++ b/benchmark/citation/cheb.py
@@ -19,7 +19,8 @@
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--num_hops', type=int, default=3)
 parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -51,5 +52,6 @@ def forward(self, data):
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-CHEBY-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    timeline_file = profile_dir + 'profile-citation-CHEBY-' + args.dataset + '-random_splits-' + str(
+        args.random_splits) + '.json'
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/gat.py b/benchmark/citation/gat.py
index 4f8b2027be12..0ac29696a416 100644
--- a/benchmark/citation/gat.py
+++ b/benchmark/citation/gat.py
@@ -20,7 +20,8 @@
 parser.add_argument('--heads', type=int, default=8)
 parser.add_argument('--output_heads', type=int, default=1)
 parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -56,5 +57,6 @@ def forward(self, data):
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-GAT-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    timeline_file = profile_dir + 'profile-citation-GAT-' + args.dataset + '-random_splits-' + str(
+        args.random_splits) + '.json'
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/gcn.py b/benchmark/citation/gcn.py
index b84b91133e4b..d1f7683e1c28 100644
--- a/benchmark/citation/gcn.py
+++ b/benchmark/citation/gcn.py
@@ -18,7 +18,8 @@
 parser.add_argument('--dropout', type=float, default=0.5)
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -50,5 +51,6 @@ def forward(self, data):
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-GCN-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    timeline_file = profile_dir + 'profile-citation-GCN-' + args.dataset + '-random_splits-' + str(
+        args.random_splits) + '.json'
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/sgc.py b/benchmark/citation/sgc.py
index 936879a81824..5ad43c3ada59 100644
--- a/benchmark/citation/sgc.py
+++ b/benchmark/citation/sgc.py
@@ -17,7 +17,8 @@
 parser.add_argument('--normalize_features', type=bool, default=False)
 parser.add_argument('--K', type=int, default=2)
 parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -46,5 +47,6 @@ def forward(self, data):
     import os
     import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-SGC-' + args.dataset + '-random_splits-' + str(args.random_splits) + '.json'
+    timeline_file = profile_dir + 'profile-citation-SGC-' + args.dataset + '-random_splits-' + str(
+        args.random_splits) + '.json'
     os.rename('timeline.json', timeline_file)
diff --git a/benchmark/citation/train_eval.py b/benchmark/citation/train_eval.py
index 7f095b5e4022..2710109f87ef 100644
--- a/benchmark/citation/train_eval.py
+++ b/benchmark/citation/train_eval.py
@@ -4,12 +4,13 @@
 import torch.nn.functional as F
 from torch import tensor
 from torch.optim import Adam
-from torch.profiler import profile, ProfilerActivity
+from torch.profiler import ProfilerActivity, profile
 
 from torch_geometric.utils import index_to_mask
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-profile_sort = "self_cuda_time_total" if torch.cuda.is_available() else "self_cpu_time_total"
+profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
+) else "self_cpu_time_total"
 
 
 def random_planetoid_splits(data, num_classes):
@@ -35,6 +36,7 @@ def random_planetoid_splits(data, num_classes):
 
     return data
 
+
 def trace_handler(p):
     output = p.key_averages().table(sort_by=profile_sort)
     print(output)
@@ -43,8 +45,9 @@ def trace_handler(p):
     timeline_file = profile_dir + 'timeline' + '.json'
     p.export_chrome_trace(timeline_file)
 
-def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inference, profiling,
-        permute_masks=None, logger=None):
+
+def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
+        inference, profiling, permute_masks=None, logger=None):
     val_losses, accs, durations = [], [], []
     if not inference:
         for _ in range(runs):
@@ -54,7 +57,8 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inferenc
             data = data.to(device)
 
             model.to(device).reset_parameters()
-            optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+            optimizer = Adam(model.parameters(), lr=lr,
+                             weight_decay=weight_decay)
 
             if torch.cuda.is_available():
                 torch.cuda.synchronize()
@@ -91,9 +95,11 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inferenc
             val_losses.append(best_val_loss)
             accs.append(test_acc)
             durations.append(t_end - t_start)
-        loss, acc, duration = tensor(val_losses), tensor(accs), tensor(durations)
+        loss, acc, duration = tensor(val_losses), tensor(accs), tensor(
+            durations)
 
-        print(f'Val Loss: {float(loss.mean()):.4f}, '
+        print(
+            f'Val Loss: {float(loss.mean()):.4f}, '
             f'Test Accuracy: {float(acc.mean()):.3f} ± {float(acc.std()):.3f}, '
             f'Duration: {float(duration.mean()):.3f}s')
     else:
@@ -108,9 +114,10 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inferenc
             for epoch in range(1, epochs + 1):
                 if i == int(runs / 2) and epoch == int(epochs / 2):
                     if profiling:
-                        with profile(activities=[
-                            ProfilerActivity.CPU, ProfilerActivity.CUDA],
-                            on_trace_ready=trace_handler) as p:
+                        with profile(
+                                activities=[
+                                    ProfilerActivity.CPU, ProfilerActivity.CUDA
+                                ], on_trace_ready=trace_handler) as p:
                             test(model, data)
                             p.step()
                     else:
@@ -124,10 +131,12 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping, inferenc
                             torch.cuda.synchronize()
                         t_end = time.time()
                         duration = t_end - t_start
-                        print("End-to-End time: {} s".format(duration), flush=True)
+                        print("End-to-End time: {} s".format(duration),
+                              flush=True)
                 else:
                     test(model, data)
 
+
 def train(model, optimizer, data):
     model.train()
     optimizer.zero_grad()
@@ -155,6 +164,7 @@ def evaluate(model, data):
 
     return outs
 
+
 def test(model, data):
     model.eval()
     with torch.no_grad():
diff --git a/examples/hetero/to_hetero_mag.py b/examples/hetero/to_hetero_mag.py
index 69c2e97ecd74..bbad09c1aa57 100644
--- a/examples/hetero/to_hetero_mag.py
+++ b/examples/hetero/to_hetero_mag.py
@@ -5,8 +5,8 @@
 import torch
 import torch.nn.functional as F
 from torch.nn import ReLU
+from torch.profiler import ProfilerActivity, profile
 from tqdm import tqdm
-from torch.profiler import profile, ProfilerActivity
 
 import torch_geometric.transforms as T
 from torch_geometric.datasets import OGB_MAG
@@ -16,11 +16,13 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('--use_hgt_loader', action='store_true')
 parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool, default=False) # Currently support profile in inference
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-profile_sort = "self_cuda_time_total" if torch.cuda.is_available() else "self_cpu_time_total"
+profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
+) else "self_cpu_time_total"
 
 path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/OGB')
 transform = T.ToUndirected(merge=True)
@@ -53,6 +55,7 @@
 ])
 model = to_hetero(model, data.metadata(), aggr='sum').to(device)
 
+
 def trace_handler(p):
     output = p.key_averages().table(sort_by=profile_sort)
     print(output)
@@ -61,6 +64,7 @@ def trace_handler(p):
     timeline_file = profile_dir + 'timeline-to-hetero-mag' + '.json'
     p.export_chrome_trace(timeline_file)
 
+
 @torch.no_grad()
 def init_params():
     # Initialize lazy parameters via forwarding a single batch to the model:
@@ -104,6 +108,7 @@ def test(loader):
 
     return total_correct / total_examples
 
+
 @torch.no_grad()
 def inference(loader):
     model.eval()
@@ -112,6 +117,7 @@ def inference(loader):
         batch_size = batch['paper'].batch_size
         model(batch.x_dict, batch.edge_index_dict)
 
+
 init_params()  # Initialize parameters.
 if not args.inference:
     optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
@@ -124,11 +130,12 @@ def inference(loader):
     for epoch in range(1, 21):
         if epoch == 20:
             if args.profile:
-                with profile(activities=[
-                    ProfilerActivity.CPU, ProfilerActivity.CUDA],
-                    on_trace_ready=trace_handler) as p:
-                     inference(val_loader)
-                     p.step()
+                with profile(
+                        activities=[
+                            ProfilerActivity.CPU, ProfilerActivity.CUDA
+                        ], on_trace_ready=trace_handler) as p:
+                    inference(val_loader)
+                    p.step()
             else:
                 if torch.cuda.is_available():
                     torch.cuda.synchronize()

From d5dfd35e22d400f5d8849bb275dd54443f53d3ea Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 4 Jul 2022 09:27:46 +0800
Subject: [PATCH 07/24] Add inference for pna

---
 examples/pna.py | 64 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 8 deletions(-)

diff --git a/examples/pna.py b/examples/pna.py
index 4697f49d7121..8f530c2fce39 100644
--- a/examples/pna.py
+++ b/examples/pna.py
@@ -1,15 +1,24 @@
 import os.path as osp
+import time
+import argparse
 
 import torch
 import torch.nn.functional as F
 from torch.nn import Embedding, Linear, ModuleList, ReLU, Sequential
 from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torch.profiler import ProfilerActivity, profile
 
 from torch_geometric.datasets import ZINC
 from torch_geometric.loader import DataLoader
 from torch_geometric.nn import BatchNorm, PNAConv, global_add_pool
 from torch_geometric.utils import degree
 
+parser = argparse.ArgumentParser()
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
+args = parser.parse_args()
+
 path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'ZINC')
 train_dataset = ZINC(path, subset=True, split='train')
 val_dataset = ZINC(path, subset=True, split='val')
@@ -68,10 +77,20 @@ def forward(self, x, edge_index, edge_attr, batch):
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = Net().to(device)
+profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
+) else "self_cpu_time_total"
 optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
 scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20,
                               min_lr=0.00001)
 
+def trace_handler(p):
+    output = p.key_averages().table(sort_by=profile_sort)
+    print(output)
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'timeline-to-pna' + '.json'
+    p.export_chrome_trace(timeline_file)
+
 
 def train(epoch):
     model.train()
@@ -99,11 +118,40 @@ def test(loader):
         total_error += (out.squeeze() - data.y).abs().sum().item()
     return total_error / len(loader.dataset)
 
-
-for epoch in range(1, 301):
-    loss = train(epoch)
-    val_mae = test(val_loader)
-    test_mae = test(test_loader)
-    scheduler.step(val_mae)
-    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_mae:.4f}, '
-          f'Test: {test_mae:.4f}')
+@torch.no_grad()
+def inference(loader):
+    model.eval()
+    for data in loader:
+        data = data.to(device)
+        model(data.x, data.edge_index, data.edge_attr, data.batch)
+
+if not args.inference:
+    for epoch in range(1, 301):
+        loss = train(epoch)
+        val_mae = test(val_loader)
+        test_mae = test(test_loader)
+        scheduler.step(val_mae)
+        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_mae:.4f}, '
+            f'Test: {test_mae:.4f}')
+else:
+    for epoch in range(1, 301):
+        if epoch == 300:
+            if args.profile:
+                with profile(
+                    activities=[
+                        ProfilerActivity.CPU, ProfilerActivity.CUDA
+                    ], on_trace_ready=trace_handler) as p:
+                    inference(test_loader)
+                    p.step()
+            else:
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_start = time.time()
+                inference(test_loader)
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_end = time.time()
+                duration = t_end - t_start
+                print("End-to-End time: {} s".format(duration), flush=True)
+        else:
+            inference(test_loader)

From 5cb18c8ada5a07c69930ed9a2b8e58bbd0174fac Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 4 Jul 2022 01:30:56 +0000
Subject: [PATCH 08/24] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 examples/pna.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/pna.py b/examples/pna.py
index 8f530c2fce39..c81e97eb4af9 100644
--- a/examples/pna.py
+++ b/examples/pna.py
@@ -1,6 +1,6 @@
+import argparse
 import os.path as osp
 import time
-import argparse
 
 import torch
 import torch.nn.functional as F
@@ -83,6 +83,7 @@ def forward(self, x, edge_index, edge_attr, batch):
 scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20,
                               min_lr=0.00001)
 
+
 def trace_handler(p):
     output = p.key_averages().table(sort_by=profile_sort)
     print(output)
@@ -118,6 +119,7 @@ def test(loader):
         total_error += (out.squeeze() - data.y).abs().sum().item()
     return total_error / len(loader.dataset)
 
+
 @torch.no_grad()
 def inference(loader):
     model.eval()
@@ -125,6 +127,7 @@ def inference(loader):
         data = data.to(device)
         model(data.x, data.edge_index, data.edge_attr, data.batch)
 
+
 if not args.inference:
     for epoch in range(1, 301):
         loss = train(epoch)
@@ -132,15 +135,15 @@ def inference(loader):
         test_mae = test(test_loader)
         scheduler.step(val_mae)
         print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_mae:.4f}, '
-            f'Test: {test_mae:.4f}')
+              f'Test: {test_mae:.4f}')
 else:
     for epoch in range(1, 301):
         if epoch == 300:
             if args.profile:
                 with profile(
-                    activities=[
-                        ProfilerActivity.CPU, ProfilerActivity.CUDA
-                    ], on_trace_ready=trace_handler) as p:
+                        activities=[
+                            ProfilerActivity.CPU, ProfilerActivity.CUDA
+                        ], on_trace_ready=trace_handler) as p:
                     inference(test_loader)
                     p.step()
             else:

From c8d3b8c2f426942bf8f58abb7bd721592613f8e6 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 4 Jul 2022 15:53:24 +0800
Subject: [PATCH 09/24] Add inference for benchmark/points/edge_cnn

---
 benchmark/points/edge_cnn.py   | 13 +++++-
 benchmark/points/train_eval.py | 83 +++++++++++++++++++++++++---------
 2 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/benchmark/points/edge_cnn.py b/benchmark/points/edge_cnn.py
index 9c4a2fa82548..40902e4a351b 100644
--- a/benchmark/points/edge_cnn.py
+++ b/benchmark/points/edge_cnn.py
@@ -17,6 +17,9 @@
 parser.add_argument('--lr_decay_factor', type=float, default=0.5)
 parser.add_argument('--lr_decay_step_size', type=int, default=50)
 parser.add_argument('--weight_decay', type=float, default=0)
+parser.add_argument('--inference', type=bool, default=False)
+parser.add_argument('--profile', type=bool,
+                    default=False)  # Currently support profile in inference
 args = parser.parse_args()
 
 
@@ -54,5 +57,13 @@ def forward(self, pos, batch):
 
 train_dataset, test_dataset = get_dataset(num_points=1024)
 model = Net(train_dataset.num_classes)
+print("edge_cnn", end=' ')
 run(train_dataset, test_dataset, model, args.epochs, args.batch_size, args.lr,
-    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay)
+    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay, args.inference, args.profile)
+
+if args.profile:
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'profile-points-edge_cnn.json'
+    os.rename('timeline.json', timeline_file)
diff --git a/benchmark/points/train_eval.py b/benchmark/points/train_eval.py
index b19833c8eb33..6a7ddf835562 100644
--- a/benchmark/points/train_eval.py
+++ b/benchmark/points/train_eval.py
@@ -7,10 +7,19 @@
 from torch_geometric.loader import DataLoader
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
+) else "self_cpu_time_total"
 
+def trace_handler(p):
+    output = p.key_averages().table(sort_by=profile_sort)
+    print(output)
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'timeline' + '.json'
+    p.export_chrome_trace(timeline_file)
 
 def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
-        lr_decay_factor, lr_decay_step_size, weight_decay):
+        lr_decay_factor, lr_decay_step_size, weight_decay, inference, profiling):
 
     model = model.to(device)
     optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
@@ -18,26 +27,52 @@ def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
     train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size, shuffle=False)
 
-    for epoch in range(1, epochs + 1):
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-
-        t_start = time.perf_counter()
-
-        train(model, optimizer, train_loader, device)
-        test_acc = test(model, test_loader, device)
-
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-
-        t_end = time.perf_counter()
-
-        print(f'Epoch: {epoch:03d}, Test: {test_acc:.4f}, '
-              f'Duration: {t_end - t_start:.2f}')
-
-        if epoch % lr_decay_step_size == 0:
-            for param_group in optimizer.param_groups:
-                param_group['lr'] = lr_decay_factor * param_group['lr']
+    if not inference:
+        for epoch in range(1, epochs + 1):
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+
+            t_start = time.perf_counter()
+
+            train(model, optimizer, train_loader, device)
+            test_acc = test(model, test_loader, device)
+
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+
+            t_end = time.perf_counter()
+
+            print(f'Epoch: {epoch:03d}, Test: {test_acc:.4f}, '
+                f'Duration: {t_end - t_start:.2f}')
+
+            if epoch % lr_decay_step_size == 0:
+                for param_group in optimizer.param_groups:
+                    param_group['lr'] = lr_decay_factor * param_group['lr']
+    else:
+        for epoch in range(1, epochs + 1):
+            if epoch == epochs:
+                if profiling:
+                    with profile(
+                        activities=[
+                            ProfilerActivity.CPU, ProfilerActivity.CUDA
+                        ], on_trace_ready=trace_handler) as p:
+                        inference(model, test_loader, device)
+                        p.step()
+                else:
+                    if torch.cuda.is_available():
+                        torch.cuda.synchronize()
+                    t_start = time.time()
+
+                    inference(model, test_loader, device)
+
+                    if torch.cuda.is_available():
+                        torch.cuda.synchronize()
+                    t_end = time.time()
+                    duration = t_end - t_start
+                    print("End-to-End time: {} s".format(duration),
+                              flush=True)
+            else:
+                inference(model, test_loader, device)
 
 
 def train(model, optimizer, train_loader, device):
@@ -63,3 +98,9 @@ def test(model, test_loader, device):
     test_acc = correct / len(test_loader.dataset)
 
     return test_acc
+
+def inference(model, test_loader, device):
+    model.eval()
+    for data in test_loader:
+        data = data.to(device)
+        model(data.pos, data.batch)

From e563390cb2e481aa17ebdc688cb20652c40c712a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 4 Jul 2022 07:56:41 +0000
Subject: [PATCH 10/24] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 benchmark/points/edge_cnn.py   |  3 ++-
 benchmark/points/train_eval.py | 17 ++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/benchmark/points/edge_cnn.py b/benchmark/points/edge_cnn.py
index 40902e4a351b..7431344ae133 100644
--- a/benchmark/points/edge_cnn.py
+++ b/benchmark/points/edge_cnn.py
@@ -59,7 +59,8 @@ def forward(self, pos, batch):
 model = Net(train_dataset.num_classes)
 print("edge_cnn", end=' ')
 run(train_dataset, test_dataset, model, args.epochs, args.batch_size, args.lr,
-    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay, args.inference, args.profile)
+    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay,
+    args.inference, args.profile)
 
 if args.profile:
     import os
diff --git a/benchmark/points/train_eval.py b/benchmark/points/train_eval.py
index 6a7ddf835562..30c52808c98e 100644
--- a/benchmark/points/train_eval.py
+++ b/benchmark/points/train_eval.py
@@ -10,6 +10,7 @@
 profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
 ) else "self_cpu_time_total"
 
+
 def trace_handler(p):
     output = p.key_averages().table(sort_by=profile_sort)
     print(output)
@@ -18,8 +19,10 @@ def trace_handler(p):
     timeline_file = profile_dir + 'timeline' + '.json'
     p.export_chrome_trace(timeline_file)
 
+
 def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
-        lr_decay_factor, lr_decay_step_size, weight_decay, inference, profiling):
+        lr_decay_factor, lr_decay_step_size, weight_decay, inference,
+        profiling):
 
     model = model.to(device)
     optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
@@ -43,7 +46,7 @@ def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
             t_end = time.perf_counter()
 
             print(f'Epoch: {epoch:03d}, Test: {test_acc:.4f}, '
-                f'Duration: {t_end - t_start:.2f}')
+                  f'Duration: {t_end - t_start:.2f}')
 
             if epoch % lr_decay_step_size == 0:
                 for param_group in optimizer.param_groups:
@@ -53,9 +56,9 @@ def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
             if epoch == epochs:
                 if profiling:
                     with profile(
-                        activities=[
-                            ProfilerActivity.CPU, ProfilerActivity.CUDA
-                        ], on_trace_ready=trace_handler) as p:
+                            activities=[
+                                ProfilerActivity.CPU, ProfilerActivity.CUDA
+                            ], on_trace_ready=trace_handler) as p:
                         inference(model, test_loader, device)
                         p.step()
                 else:
@@ -69,8 +72,7 @@ def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
                         torch.cuda.synchronize()
                     t_end = time.time()
                     duration = t_end - t_start
-                    print("End-to-End time: {} s".format(duration),
-                              flush=True)
+                    print("End-to-End time: {} s".format(duration), flush=True)
             else:
                 inference(model, test_loader, device)
 
@@ -99,6 +101,7 @@ def test(model, test_loader, device):
 
     return test_acc
 
+
 def inference(model, test_loader, device):
     model.eval()
     for data in test_loader:

From b129f9d15ee49b614e6a67c4967d81e978a90f8a Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 4 Jul 2022 16:16:10 +0800
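Subject: [PATCH 11/24] Fix error

Add the missing `torch.profiler` import (`ProfilerActivity`, `profile`)
to benchmark/points/train_eval.py, and rename the `inference()` helper
to `inference_run()` so that it is no longer shadowed by the boolean
`inference` parameter of `run()`.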

---
 benchmark/points/train_eval.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/benchmark/points/train_eval.py b/benchmark/points/train_eval.py
index 30c52808c98e..b17b4184b8ed 100644
--- a/benchmark/points/train_eval.py
+++ b/benchmark/points/train_eval.py
@@ -3,6 +3,7 @@
 import torch
 import torch.nn.functional as F
 from torch.optim import Adam
+from torch.profiler import ProfilerActivity, profile
 
 from torch_geometric.loader import DataLoader
 
@@ -59,14 +60,14 @@ def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
                             activities=[
                                 ProfilerActivity.CPU, ProfilerActivity.CUDA
                             ], on_trace_ready=trace_handler) as p:
-                        inference(model, test_loader, device)
+                        inference_run(model, test_loader, device)
                         p.step()
                 else:
                     if torch.cuda.is_available():
                         torch.cuda.synchronize()
                     t_start = time.time()
 
-                    inference(model, test_loader, device)
+                    inference_run(model, test_loader, device)
 
                     if torch.cuda.is_available():
                         torch.cuda.synchronize()
@@ -74,7 +75,7 @@ def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
                     duration = t_end - t_start
                     print("End-to-End time: {} s".format(duration), flush=True)
             else:
-                inference(model, test_loader, device)
+                inference_run(model, test_loader, device)
 
 
 def train(model, optimizer, train_loader, device):
@@ -102,7 +103,7 @@ def test(model, test_loader, device):
     return test_acc
 
 
-def inference(model, test_loader, device):
+def inference_run(model, test_loader, device):
     model.eval()
     for data in test_loader:
         data = data.to(device)

From b80cd4107565230c1e0e852068dd3f0d3cd472be Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 11 Jul 2022 14:36:02 +0800
Subject: [PATCH 12/24] Update scripts

---
 benchmark/citation/appnp.py         |  19 ++-
 benchmark/citation/arma.py          |  29 ++---
 benchmark/citation/cheb.py          |  21 ++--
 benchmark/citation/gat.py           |  27 ++--
 benchmark/citation/gcn.py           |  21 ++--
 benchmark/citation/inference.sh     | 144 ++++++++++-----------
 benchmark/citation/sgc.py           |  21 ++--
 benchmark/citation/train_eval.py    | 189 ++++++++++++++--------------
 benchmark/points/edge_cnn.py        |  13 +-
 benchmark/points/train_eval.py      |  13 +-
 examples/hetero/to_hetero_mag.py    |  18 +--
 examples/pna.py                     |  18 +--
 torch_geometric/profile/__init__.py |   3 +
 torch_geometric/profile/profile.py  |  24 ++++
 14 files changed, 260 insertions(+), 300 deletions(-)

diff --git a/benchmark/citation/appnp.py b/benchmark/citation/appnp.py
index 4a9875921b2d..a6aa31357175 100644
--- a/benchmark/citation/appnp.py
+++ b/benchmark/citation/appnp.py
@@ -5,7 +5,8 @@
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 from torch.nn import Linear
 
-from torch_geometric.nn import APPNP
+from torch_geometric.nn import APPNP as Conv
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, required=True)
@@ -20,9 +21,8 @@
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--K', type=int, default=10)
 parser.add_argument('--alpha', type=float, default=0.1)
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
@@ -31,7 +31,7 @@ def __init__(self, dataset):
         super().__init__()
         self.lin1 = Linear(dataset.num_features, args.hidden)
         self.lin2 = Linear(args.hidden, dataset.num_classes)
-        self.prop1 = APPNP(args.K, args.alpha)
+        self.prop1 = Conv(args.K, args.alpha)
 
     def reset_parameters(self):
         self.lin1.reset_parameters()
@@ -49,14 +49,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-print("appnp-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    import os
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-APPNP-' + args.dataset + '-random_splits-' + str(
-        args.random_splits) + '.json'
-    os.rename('timeline.json', timeline_file)
+    rename_profile_file('citation', Conv.__name__, args.dataset,
+                        str(args.random_splits))
diff --git a/benchmark/citation/arma.py b/benchmark/citation/arma.py
index 5c65e9e0d814..37a87a17cd60 100644
--- a/benchmark/citation/arma.py
+++ b/benchmark/citation/arma.py
@@ -4,7 +4,8 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import ARMAConv
+from torch_geometric.nn import ARMAConv as Conv
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, required=True)
@@ -21,21 +22,20 @@
 parser.add_argument('--num_layers', type=int, default=1)
 parser.add_argument('--shared_weights', type=bool, default=False)
 parser.add_argument('--skip_dropout', type=float, default=0.75)
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = ARMAConv(dataset.num_features, args.hidden,
-                              args.num_stacks, args.num_layers,
-                              args.shared_weights, dropout=args.skip_dropout)
-        self.conv2 = ARMAConv(args.hidden, dataset.num_classes,
-                              args.num_stacks, args.num_layers,
-                              args.shared_weights, dropout=args.skip_dropout)
+        self.conv1 = Conv(dataset.num_features, args.hidden, args.num_stacks,
+                          args.num_layers, args.shared_weights,
+                          dropout=args.skip_dropout)
+        self.conv2 = Conv(args.hidden, dataset.num_classes, args.num_stacks,
+                          args.num_layers, args.shared_weights,
+                          dropout=args.skip_dropout)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -51,14 +51,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-print("arma-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    import os
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-ARMA-' + args.dataset + '-random_splits-' + str(
-        args.random_splits) + '.json'
-    os.rename('timeline.json', timeline_file)
+    rename_profile_file('citation', Conv.__name__, args.dataset,
+                        str(args.random_splits))
diff --git a/benchmark/citation/cheb.py b/benchmark/citation/cheb.py
index b33c2fe2e96d..4038beab2c07 100644
--- a/benchmark/citation/cheb.py
+++ b/benchmark/citation/cheb.py
@@ -4,7 +4,8 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import ChebConv
+from torch_geometric.nn import ChebConv as Conv
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, required=True)
@@ -18,17 +19,16 @@
 parser.add_argument('--dropout', type=float, default=0.5)
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--num_hops', type=int, default=3)
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = ChebConv(dataset.num_features, args.hidden, args.num_hops)
-        self.conv2 = ChebConv(args.hidden, dataset.num_classes, args.num_hops)
+        self.conv1 = Conv(dataset.num_features, args.hidden, args.num_hops)
+        self.conv2 = Conv(args.hidden, dataset.num_classes, args.num_hops)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -44,14 +44,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-print("cheby-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    import os
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-CHEBY-' + args.dataset + '-random_splits-' + str(
-        args.random_splits) + '.json'
-    os.rename('timeline.json', timeline_file)
+    rename_profile_file('citation', Conv.__name__, args.dataset,
+                        str(args.random_splits))
diff --git a/benchmark/citation/gat.py b/benchmark/citation/gat.py
index 0ac29696a416..bd202c8f8184 100644
--- a/benchmark/citation/gat.py
+++ b/benchmark/citation/gat.py
@@ -4,7 +4,8 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import GATConv
+from torch_geometric.nn import GATConv as Conv
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, required=True)
@@ -19,20 +20,19 @@
 parser.add_argument('--normalize_features', type=bool, default=True)
 parser.add_argument('--heads', type=int, default=8)
 parser.add_argument('--output_heads', type=int, default=1)
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = GATConv(dataset.num_features, args.hidden,
-                             heads=args.heads, dropout=args.dropout)
-        self.conv2 = GATConv(args.hidden * args.heads, dataset.num_classes,
-                             heads=args.output_heads, concat=False,
-                             dropout=args.dropout)
+        self.conv1 = Conv(dataset.num_features, args.hidden, heads=args.heads,
+                          dropout=args.dropout)
+        self.conv2 = Conv(args.hidden * args.heads, dataset.num_classes,
+                          heads=args.output_heads, concat=False,
+                          dropout=args.dropout)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -49,14 +49,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-print("gat-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    import os
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-GAT-' + args.dataset + '-random_splits-' + str(
-        args.random_splits) + '.json'
-    os.rename('timeline.json', timeline_file)
+    rename_profile_file('citation', Conv.__name__, args.dataset,
+                        str(args.random_splits))
diff --git a/benchmark/citation/gcn.py b/benchmark/citation/gcn.py
index d1f7683e1c28..96695f46d187 100644
--- a/benchmark/citation/gcn.py
+++ b/benchmark/citation/gcn.py
@@ -4,7 +4,8 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import GCNConv
+from torch_geometric.nn import GCNConv as Conv
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, required=True)
@@ -17,17 +18,16 @@
 parser.add_argument('--hidden', type=int, default=16)
 parser.add_argument('--dropout', type=float, default=0.5)
 parser.add_argument('--normalize_features', type=bool, default=True)
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = GCNConv(dataset.num_features, args.hidden)
-        self.conv2 = GCNConv(args.hidden, dataset.num_classes)
+        self.conv1 = Conv(dataset.num_features, args.hidden)
+        self.conv2 = Conv(args.hidden, dataset.num_classes)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -43,14 +43,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-print("gcn-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    import os
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-GCN-' + args.dataset + '-random_splits-' + str(
-        args.random_splits) + '.json'
-    os.rename('timeline.json', timeline_file)
+    rename_profile_file('citation', Conv.__name__, args.dataset,
+                        str(args.random_splits))
diff --git a/benchmark/citation/inference.sh b/benchmark/citation/inference.sh
index 7c8180573bee..cb9f3e8f23c6 100755
--- a/benchmark/citation/inference.sh
+++ b/benchmark/citation/inference.sh
@@ -4,115 +4,115 @@ echo "Cora"
 echo "===="
 
 echo "GCN"
-python gcn.py --dataset=Cora --inference=True
-python gcn.py --dataset=Cora --random_splits=True --inference=True
-python gcn.py --dataset=Cora --inference=True --profile=True
-python gcn.py --dataset=Cora --random_splits=True --inference=True --profile=True
+python gcn.py --dataset=Cora --inference
+python gcn.py --dataset=Cora --random_splits=True --inference
+python gcn.py --dataset=Cora --inference --profile
+python gcn.py --dataset=Cora --random_splits=True --inference --profile
 
 echo "GAT"
-python gat.py --dataset=Cora --inference=True
-python gat.py --dataset=Cora --random_splits=True --inference=True
-python gat.py --dataset=Cora --inference=True --profile=True
-python gat.py --dataset=Cora --random_splits=True --inference=True --profile=True
+python gat.py --dataset=Cora --inference
+python gat.py --dataset=Cora --random_splits=True --inference
+python gat.py --dataset=Cora --inference --profile
+python gat.py --dataset=Cora --random_splits=True --inference --profile
 
 echo "Cheby"
-python cheb.py --dataset=Cora --num_hops=3 --inference=True
-python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference=True
-python cheb.py --dataset=Cora --num_hops=3 --inference=True --profile=True
-python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference=True --profile=True
+python cheb.py --dataset=Cora --num_hops=3 --inference
+python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference
+python cheb.py --dataset=Cora --num_hops=3 --inference --profile
+python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference --profile
 
 echo "SGC"
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference=True
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference=True
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference=True --profile=True
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference --profile
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference --profile
 
 echo "ARMA"
-python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference=True
-python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True
-python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference=True --profile=True
-python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
+python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference
+python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference
+python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference --profile
+python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference --profile
 
 echo "APPNP"
-python appnp.py --dataset=Cora --alpha=0.1 --inference=True
-python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference=True
-python appnp.py --dataset=Cora --alpha=0.1 --inference=True --profile=True
-python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference=True --profile=True
+python appnp.py --dataset=Cora --alpha=0.1 --inference
+python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference
+python appnp.py --dataset=Cora --alpha=0.1 --inference --profile
+python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference --profile
 
 echo "CiteSeer"
 echo "========"
 
 echo "GCN"
-python gcn.py --dataset=CiteSeer --inference=True
-python gcn.py --dataset=CiteSeer --random_splits=True --inference=True
-python gcn.py --dataset=CiteSeer --inference=True --profile=True
-python gcn.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
+python gcn.py --dataset=CiteSeer --inference
+python gcn.py --dataset=CiteSeer --random_splits=True --inference
+python gcn.py --dataset=CiteSeer --inference --profile
+python gcn.py --dataset=CiteSeer --random_splits=True --inference --profile
 
 echo "GAT"
-python gat.py --dataset=CiteSeer --inference=True
-python gat.py --dataset=CiteSeer --random_splits=True --inference=True
-python gat.py --dataset=CiteSeer --inference=True --profile=True
-python gat.py --dataset=CiteSeer --random_splits=True --inference=True --profile=True
+python gat.py --dataset=CiteSeer --inference
+python gat.py --dataset=CiteSeer --random_splits=True --inference
+python gat.py --dataset=CiteSeer --inference --profile
+python gat.py --dataset=CiteSeer --random_splits=True --inference --profile
 
 echo "Cheby"
-python cheb.py --dataset=CiteSeer --num_hops=2 --inference=True
-python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference=True
-python cheb.py --dataset=CiteSeer --num_hops=2 --inference=True --profile=True
-python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference=True --profile=True
+python cheb.py --dataset=CiteSeer --num_hops=2 --inference
+python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference
+python cheb.py --dataset=CiteSeer --num_hops=2 --inference --profile
+python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference --profile
 
 echo "SGC"
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference=True
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference=True
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference=True --profile=True
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference=True --profile=True
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference --profile
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference --profile
 
 echo "ARMA"
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference=True
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference=True --profile=True
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference=True --profile=True
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference --profile
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference --profile
 
 echo "APPNP"
-python appnp.py --dataset=CiteSeer --alpha=0.1 --inference=True
-python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference=True
-python appnp.py --dataset=CiteSeer --alpha=0.1 --inference=True --profile=True
-python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference=True --profile=True
+python appnp.py --dataset=CiteSeer --alpha=0.1 --inference
+python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference
+python appnp.py --dataset=CiteSeer --alpha=0.1 --inference --profile
+python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference --profile
 
 echo "PubMed"
 echo "======"
 
 echo "GCN"
-python gcn.py --dataset=PubMed --inference=True
-python gcn.py --dataset=PubMed --random_splits=True --inference=True
-python gcn.py --dataset=PubMed --inference=True --profile=True
-python gcn.py --dataset=PubMed --random_splits=True --inference=True --profile=True
+python gcn.py --dataset=PubMed --inference
+python gcn.py --dataset=PubMed --random_splits=True --inference
+python gcn.py --dataset=PubMed --inference --profile
+python gcn.py --dataset=PubMed --random_splits=True --inference --profile
 
 echo "GAT"
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference=True
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference=True
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference=True --profile=True
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference=True --profile=True
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference --profile
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference --profile
 
 echo "Cheby"
-python cheb.py --dataset=PubMed --num_hops=2 --inference=True
-python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference=True
-python cheb.py --dataset=PubMed --num_hops=2 --inference=True --profile=True
-python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference=True --profile=True
+python cheb.py --dataset=PubMed --num_hops=2 --inference
+python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference
+python cheb.py --dataset=PubMed --num_hops=2 --inference --profile
+python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference --profile
 
 echo "SGC"
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference=True
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference=True
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference=True --profile=True
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference=True --profile=True
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference --profile
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference --profile
 
 echo "ARMA"
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference=True
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference=True
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference=True --profile=True
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference=True --profile=True
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference --profile
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference --profile
 
 echo "APPNP"
-python appnp.py --dataset=PubMed --alpha=0.1 --inference=True
-python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference=True
-python appnp.py --dataset=PubMed --alpha=0.1 --inference=True --profile=True
-python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference=True --profile=True
+python appnp.py --dataset=PubMed --alpha=0.1 --inference
+python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference
+python appnp.py --dataset=PubMed --alpha=0.1 --inference --profile
+python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference --profile
diff --git a/benchmark/citation/sgc.py b/benchmark/citation/sgc.py
index 5ad43c3ada59..a0e5e062f42c 100644
--- a/benchmark/citation/sgc.py
+++ b/benchmark/citation/sgc.py
@@ -4,7 +4,8 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import SGConv
+from torch_geometric.nn import SGConv as Conv
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, required=True)
@@ -16,17 +17,16 @@
 parser.add_argument('--early_stopping', type=int, default=10)
 parser.add_argument('--normalize_features', type=bool, default=False)
 parser.add_argument('--K', type=int, default=2)
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = SGConv(dataset.num_features, dataset.num_classes,
-                            K=args.K, cached=True)
+        self.conv1 = Conv(dataset.num_features, dataset.num_classes, K=args.K,
+                          cached=True)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -39,14 +39,9 @@ def forward(self, data):
 
 dataset = get_planetoid_dataset(args.dataset, args.normalize_features)
 permute_masks = random_planetoid_splits if args.random_splits else None
-print("sgc-{}-{}:".format(args.dataset, args.random_splits), end=' ')
 run(dataset, Net(dataset), args.runs, args.epochs, args.lr, args.weight_decay,
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    import os
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-citation-SGC-' + args.dataset + '-random_splits-' + str(
-        args.random_splits) + '.json'
-    os.rename('timeline.json', timeline_file)
+    rename_profile_file('citation', Conv.__name__, args.dataset,
+                        str(args.random_splits))
diff --git a/benchmark/citation/train_eval.py b/benchmark/citation/train_eval.py
index 2710109f87ef..9485a834e51c 100644
--- a/benchmark/citation/train_eval.py
+++ b/benchmark/citation/train_eval.py
@@ -6,11 +6,10 @@
 from torch.optim import Adam
 from torch.profiler import ProfilerActivity, profile
 
+from torch_geometric.profile import trace_handler
 from torch_geometric.utils import index_to_mask
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
-) else "self_cpu_time_total"
 
 
 def random_planetoid_splits(data, num_classes):
@@ -37,104 +36,103 @@ def random_planetoid_splits(data, num_classes):
     return data
 
 
-def trace_handler(p):
-    output = p.key_averages().table(sort_by=profile_sort)
-    print(output)
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'timeline' + '.json'
-    p.export_chrome_trace(timeline_file)
+def run_train(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
+              permute_masks=None, logger=None):
+    val_losses, accs, durations = [], [], []
+    for _ in range(runs):
+        data = dataset[0]
+        if permute_masks is not None:
+            data = permute_masks(data, dataset.num_classes)
+        data = data.to(device)
+
+        model.to(device).reset_parameters()
+        optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        t_start = time.perf_counter()
+
+        best_val_loss = float('inf')
+        test_acc = 0
+        val_loss_history = []
+
+        for epoch in range(1, epochs + 1):
+            train(model, optimizer, data)
+            eval_info = evaluate(model, data)
+            eval_info['epoch'] = epoch
+
+            if logger is not None:
+                logger(eval_info)
+
+            if eval_info['val_loss'] < best_val_loss:
+                best_val_loss = eval_info['val_loss']
+                test_acc = eval_info['test_acc']
+
+            val_loss_history.append(eval_info['val_loss'])
+            if early_stopping > 0 and epoch > epochs // 2:
+                tmp = tensor(val_loss_history[-(early_stopping + 1):-1])
+                if eval_info['val_loss'] > tmp.mean().item():
+                    break
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        t_end = time.perf_counter()
+
+        val_losses.append(best_val_loss)
+        accs.append(test_acc)
+        durations.append(t_end - t_start)
+    loss, acc, duration = tensor(val_losses), tensor(accs), tensor(durations)
+
+    print(f'Val Loss: {float(loss.mean()):.4f}, '
+          f'Test Accuracy: {float(acc.mean()):.3f} ± {float(acc.std()):.3f}, '
+          f'Duration: {float(duration.mean()):.3f}s')
+
+
+def run_inference(dataset, model, runs, epochs, profiling, permute_masks=None,
+                  logger=None):
+    """Run ``epochs`` forward passes per run; time/profile the last one."""
+    # NOTE: `logger` is accepted but not used by this function.
+    for i in range(runs):
+        # Per-run setup must stay inside this loop (it had been dedented).
+        data = dataset[0]
+        if permute_masks is not None:
+            data = permute_masks(data, dataset.num_classes)
+        data = data.to(device)
+
+        model.to(device).reset_parameters()
+
+        for epoch in range(1, epochs + 1):
+            # Only the final pass of the final run is timed or profiled.
+            if i != runs - 1 or epoch != epochs:
+                inference(model, data)
+            elif profiling:
+                activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+                with profile(activities=activities,
+                             on_trace_ready=trace_handler) as p:
+                    inference(model, data)
+                    p.step()
+            else:
+                # Synchronize around the call so queued CUDA work is counted.
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_start = time.time()
+                inference(model, data)
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_end = time.time()
+                print(f"End-to-End time: {t_end - t_start} s", flush=True)
 
 
 def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
         inference, profiling, permute_masks=None, logger=None):
-    val_losses, accs, durations = [], [], []
     if not inference:
-        for _ in range(runs):
-            data = dataset[0]
-            if permute_masks is not None:
-                data = permute_masks(data, dataset.num_classes)
-            data = data.to(device)
-
-            model.to(device).reset_parameters()
-            optimizer = Adam(model.parameters(), lr=lr,
-                             weight_decay=weight_decay)
-
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
-
-            t_start = time.perf_counter()
-
-            best_val_loss = float('inf')
-            test_acc = 0
-            val_loss_history = []
-
-            for epoch in range(1, epochs + 1):
-                train(model, optimizer, data)
-                eval_info = evaluate(model, data)
-                eval_info['epoch'] = epoch
-
-                if logger is not None:
-                    logger(eval_info)
-
-                if eval_info['val_loss'] < best_val_loss:
-                    best_val_loss = eval_info['val_loss']
-                    test_acc = eval_info['test_acc']
-
-                val_loss_history.append(eval_info['val_loss'])
-                if early_stopping > 0 and epoch > epochs // 2:
-                    tmp = tensor(val_loss_history[-(early_stopping + 1):-1])
-                    if eval_info['val_loss'] > tmp.mean().item():
-                        break
-
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
-
-            t_end = time.perf_counter()
-
-            val_losses.append(best_val_loss)
-            accs.append(test_acc)
-            durations.append(t_end - t_start)
-        loss, acc, duration = tensor(val_losses), tensor(accs), tensor(
-            durations)
-
-        print(
-            f'Val Loss: {float(loss.mean()):.4f}, '
-            f'Test Accuracy: {float(acc.mean()):.3f} ± {float(acc.std()):.3f}, '
-            f'Duration: {float(duration.mean()):.3f}s')
+        run_train(dataset, model, runs, epochs, lr, weight_decay,
+                  early_stopping, permute_masks, logger)
     else:
-        for i in range(runs):
-            data = dataset[0]
-            if permute_masks is not None:
-                data = permute_masks(data, dataset.num_classes)
-            data = data.to(device)
-
-            model.to(device).reset_parameters()
-
-            for epoch in range(1, epochs + 1):
-                if i == int(runs / 2) and epoch == int(epochs / 2):
-                    if profiling:
-                        with profile(
-                                activities=[
-                                    ProfilerActivity.CPU, ProfilerActivity.CUDA
-                                ], on_trace_ready=trace_handler) as p:
-                            test(model, data)
-                            p.step()
-                    else:
-                        if torch.cuda.is_available():
-                            torch.cuda.synchronize()
-                        t_start = time.time()
-
-                        test(model, data)
-
-                        if torch.cuda.is_available():
-                            torch.cuda.synchronize()
-                        t_end = time.time()
-                        duration = t_end - t_start
-                        print("End-to-End time: {} s".format(duration),
-                              flush=True)
-                else:
-                    test(model, data)
+        run_inference(dataset, model, runs, epochs, profiling, permute_masks,
+                      logger)
 
 
 def train(model, optimizer, data):
@@ -165,7 +163,8 @@ def evaluate(model, data):
     return outs
 
 
-def test(model, data):
+@torch.no_grad()
+def inference(model, data):
     model.eval()
     with torch.no_grad():
         model(data)
diff --git a/benchmark/points/edge_cnn.py b/benchmark/points/edge_cnn.py
index 7431344ae133..1541216906fb 100644
--- a/benchmark/points/edge_cnn.py
+++ b/benchmark/points/edge_cnn.py
@@ -9,6 +9,7 @@
 from torch.nn import Sequential as Seq
 
 from torch_geometric.nn import DynamicEdgeConv, global_max_pool
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--epochs', type=int, default=200)
@@ -17,9 +18,8 @@
 parser.add_argument('--lr_decay_factor', type=float, default=0.5)
 parser.add_argument('--lr_decay_step_size', type=int, default=50)
 parser.add_argument('--weight_decay', type=float, default=0)
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
@@ -57,14 +57,9 @@ def forward(self, pos, batch):
 
 train_dataset, test_dataset = get_dataset(num_points=1024)
 model = Net(train_dataset.num_classes)
-print("edge_cnn", end=' ')
 run(train_dataset, test_dataset, model, args.epochs, args.batch_size, args.lr,
     args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay,
     args.inference, args.profile)
 
 if args.profile:
-    import os
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'profile-points-edge_cnn.json'
-    os.rename('timeline.json', timeline_file)
+    rename_profile_file('points', 'DynamicEdgeConv')
diff --git a/benchmark/points/train_eval.py b/benchmark/points/train_eval.py
index b17b4184b8ed..abdd303e38dd 100644
--- a/benchmark/points/train_eval.py
+++ b/benchmark/points/train_eval.py
@@ -6,19 +6,9 @@
 from torch.profiler import ProfilerActivity, profile
 
 from torch_geometric.loader import DataLoader
+from torch_geometric.profile import trace_handler
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
-) else "self_cpu_time_total"
-
-
-def trace_handler(p):
-    output = p.key_averages().table(sort_by=profile_sort)
-    print(output)
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'timeline' + '.json'
-    p.export_chrome_trace(timeline_file)
 
 
 def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
@@ -103,6 +93,7 @@ def test(model, test_loader, device):
     return test_acc
 
 
+@torch.no_grad()
 def inference_run(model, test_loader, device):
     model.eval()
     for data in test_loader:
diff --git a/examples/hetero/to_hetero_mag.py b/examples/hetero/to_hetero_mag.py
index bbad09c1aa57..d0b0bd4a525b 100644
--- a/examples/hetero/to_hetero_mag.py
+++ b/examples/hetero/to_hetero_mag.py
@@ -12,17 +12,15 @@
 from torch_geometric.datasets import OGB_MAG
 from torch_geometric.loader import HGTLoader, NeighborLoader
 from torch_geometric.nn import Linear, SAGEConv, Sequential, to_hetero
+from torch_geometric.profile import trace_handler
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--use_hgt_loader', action='store_true')
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
-) else "self_cpu_time_total"
 
 path = osp.join(osp.dirname(osp.realpath(__file__)), '../../data/OGB')
 transform = T.ToUndirected(merge=True)
@@ -56,15 +54,6 @@
 model = to_hetero(model, data.metadata(), aggr='sum').to(device)
 
 
-def trace_handler(p):
-    output = p.key_averages().table(sort_by=profile_sort)
-    print(output)
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'timeline-to-hetero-mag' + '.json'
-    p.export_chrome_trace(timeline_file)
-
-
 @torch.no_grad()
 def init_params():
     # Initialize lazy parameters via forwarding a single batch to the model:
@@ -114,7 +103,6 @@ def inference(loader):
     model.eval()
     for batch in tqdm(loader):
         batch = batch.to(device, 'edge_index')
-        batch_size = batch['paper'].batch_size
         model(batch.x_dict, batch.edge_index_dict)
 
 
diff --git a/examples/pna.py b/examples/pna.py
index c81e97eb4af9..a7ee23bd6367 100644
--- a/examples/pna.py
+++ b/examples/pna.py
@@ -11,12 +11,12 @@
 from torch_geometric.datasets import ZINC
 from torch_geometric.loader import DataLoader
 from torch_geometric.nn import BatchNorm, PNAConv, global_add_pool
+from torch_geometric.profile import rename_profile_file, trace_handler
 from torch_geometric.utils import degree
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--inference', type=bool, default=False)
-parser.add_argument('--profile', type=bool,
-                    default=False)  # Currently support profile in inference
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'ZINC')
@@ -77,22 +77,11 @@ def forward(self, x, edge_index, edge_attr, batch):
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = Net().to(device)
-profile_sort = "self_cuda_time_total" if torch.cuda.is_available(
-) else "self_cpu_time_total"
 optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
 scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20,
                               min_lr=0.00001)
 
 
-def trace_handler(p):
-    output = p.key_averages().table(sort_by=profile_sort)
-    print(output)
-    import pathlib
-    profile_dir = str(pathlib.Path.cwd()) + '/'
-    timeline_file = profile_dir + 'timeline-to-pna' + '.json'
-    p.export_chrome_trace(timeline_file)
-
-
 def train(epoch):
     model.train()
 
@@ -146,6 +135,7 @@ def inference(loader):
                         ], on_trace_ready=trace_handler) as p:
                     inference(test_loader)
                     p.step()
+                rename_profile_file('pna')
             else:
                 if torch.cuda.is_available():
                     torch.cuda.synchronize()
diff --git a/torch_geometric/profile/__init__.py b/torch_geometric/profile/__init__.py
index 9856d4eca56f..7e0fce58c498 100644
--- a/torch_geometric/profile/__init__.py
+++ b/torch_geometric/profile/__init__.py
@@ -1,4 +1,5 @@
 from .profile import profileit, timeit, get_stats_summary
+from .profile import trace_handler, rename_profile_file
 from .utils import count_parameters
 from .utils import get_model_size
 from .utils import get_data_size
@@ -10,6 +11,8 @@
     'profileit',
     'timeit',
     'get_stats_summary',
+    'trace_handler',
+    'rename_profile_file',
     'count_parameters',
     'get_model_size',
     'get_data_size',
diff --git a/torch_geometric/profile/profile.py b/torch_geometric/profile/profile.py
index fa2477535332..f9c5ceb76eff 100644
--- a/torch_geometric/profile/profile.py
+++ b/torch_geometric/profile/profile.py
@@ -172,3 +172,27 @@ def std(values: List[float]):
 
 def mean(values: List[float]):
     return float(torch.tensor(values).mean())
+
+
+def trace_handler(p):
+    if torch.cuda.is_available():
+        profile_sort = 'self_cuda_time_total'
+    else:
+        profile_sort = 'self_cpu_time_total'
+    output = p.key_averages().table(sort_by=profile_sort)
+    print(output)
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'timeline' + '.json'
+    p.export_chrome_trace(timeline_file)
+
+
+def rename_profile_file(*args):
+    import os
+    import pathlib
+    profile_dir = str(pathlib.Path.cwd()) + '/'
+    timeline_file = profile_dir + 'profile'
+    for arg in args:
+        timeline_file += '-' + arg
+    timeline_file += '.json'
+    os.rename('timeline.json', timeline_file)

From 9bd310a7e9e00c397f01ae05b4214e5eb04d73ed Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 11 Jul 2022 16:01:56 +0800
Subject: [PATCH 13/24] Add profile test to increase code coverage

---
 test/profile/test_profile.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/test/profile/test_profile.py b/test/profile/test_profile.py
index 2019a39506fd..06bcb0f908d0 100644
--- a/test/profile/test_profile.py
+++ b/test/profile/test_profile.py
@@ -1,8 +1,15 @@
 import torch
 import torch.nn.functional as F
+from torch.profiler import ProfilerActivity, profile
 
 from torch_geometric.nn import GraphSAGE
-from torch_geometric.profile import get_stats_summary, profileit, timeit
+from torch_geometric.profile import (
+    get_stats_summary,
+    profileit,
+    rename_profile_file,
+    timeit,
+    trace_handler,
+)
 from torch_geometric.testing import withCUDA
 
 
@@ -41,7 +48,11 @@ def test(model, x, edge_index, y):
         assert stats.nvidia_smi_free_cuda > 0
         assert stats.nvidia_smi_used_cuda > 0
 
-        _, time = test(model, data.x, data.edge_index, data.y)
+        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                     on_trace_ready=trace_handler) as p:
+            _, time = test(model, data.x, data.edge_index, data.y)
+            p.step()
+
         assert time > 0
 
         if epoch >= 2:  # Warm-up
@@ -56,3 +67,7 @@ def test(model, x, edge_index, y):
     assert stats_summary.max_active_cuda > 0
     assert stats_summary.min_nvidia_smi_free_cuda > 0
     assert stats_summary.max_nvidia_smi_used_cuda > 0
+
+    rename_profile_file('test_profile')
+    import os.path
+    assert os.path.exists('profile-test_profile.json')

From 5e90e8192c71877bfaeff3934ab2b89dd76d3155 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 11 Jul 2022 16:33:58 +0800
Subject: [PATCH 14/24] Update script of points benchmark

---
 benchmark/points/train_eval.py | 102 ++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 45 deletions(-)

diff --git a/benchmark/points/train_eval.py b/benchmark/points/train_eval.py
index abdd303e38dd..ca0cfa6fe6fc 100644
--- a/benchmark/points/train_eval.py
+++ b/benchmark/points/train_eval.py
@@ -11,61 +11,74 @@
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
-def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
-        lr_decay_factor, lr_decay_step_size, weight_decay, inference,
-        profiling):
-
+def run_train(train_dataset, test_dataset, model, epochs, batch_size, lr,
+              lr_decay_factor, lr_decay_step_size, weight_decay):
     model = model.to(device)
     optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
-
     train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size, shuffle=False)
 
-    if not inference:
-        for epoch in range(1, epochs + 1):
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
+    for epoch in range(1, epochs + 1):
+        print("Epoch {} starts".format(epoch))
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
 
-            t_start = time.perf_counter()
+        t_start = time.perf_counter()
 
-            train(model, optimizer, train_loader, device)
-            test_acc = test(model, test_loader, device)
+        train(model, optimizer, train_loader, device)
+        test_acc = test(model, test_loader, device)
 
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
 
-            t_end = time.perf_counter()
+        t_end = time.perf_counter()
 
-            print(f'Epoch: {epoch:03d}, Test: {test_acc:.4f}, '
-                  f'Duration: {t_end - t_start:.2f}')
+        print(f'Epoch: {epoch:03d}, Test: {test_acc:.4f}, '
+              f'Duration: {t_end - t_start:.2f}')
 
-            if epoch % lr_decay_step_size == 0:
-                for param_group in optimizer.param_groups:
-                    param_group['lr'] = lr_decay_factor * param_group['lr']
-    else:
-        for epoch in range(1, epochs + 1):
-            if epoch == epochs:
-                if profiling:
-                    with profile(
-                            activities=[
-                                ProfilerActivity.CPU, ProfilerActivity.CUDA
-                            ], on_trace_ready=trace_handler) as p:
-                        inference_run(model, test_loader, device)
-                        p.step()
-                else:
-                    if torch.cuda.is_available():
-                        torch.cuda.synchronize()
-                    t_start = time.time()
-
-                    inference_run(model, test_loader, device)
-
-                    if torch.cuda.is_available():
-                        torch.cuda.synchronize()
-                    t_end = time.time()
-                    duration = t_end - t_start
-                    print("End-to-End time: {} s".format(duration), flush=True)
+        if epoch % lr_decay_step_size == 0:
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = lr_decay_factor * param_group['lr']
+
+
+def run_inference(test_dataset, model, epochs, batch_size, profiling):
+    model = model.to(device)
+    test_loader = DataLoader(test_dataset, batch_size, shuffle=False)
+
+    for epoch in range(1, epochs + 1):
+        print("Epoch {} starts".format(epoch))
+        if epoch == epochs:
+            if profiling:
+                with profile(
+                        activities=[
+                            ProfilerActivity.CPU, ProfilerActivity.CUDA
+                        ], on_trace_ready=trace_handler) as p:
+                    inference(model, test_loader, device)
+                    p.step()
             else:
-                inference_run(model, test_loader, device)
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_start = time.time()
+
+                inference(model, test_loader, device)
+
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                t_end = time.time()
+                duration = t_end - t_start
+                print("End-to-End time: {} s".format(duration), flush=True)
+        else:
+            inference(model, test_loader, device)
+
+
+def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
+        lr_decay_factor, lr_decay_step_size, weight_decay, inference,
+        profiling):
+    if not inference:
+        run_train(train_dataset, test_dataset, model, epochs, batch_size, lr,
+                  lr_decay_factor, lr_decay_step_size, weight_decay)
+    else:
+        run_inference(test_dataset, model, epochs, batch_size, profiling)
 
 
 def train(model, optimizer, train_loader, device):
@@ -94,7 +106,7 @@ def test(model, test_loader, device):
 
 
 @torch.no_grad()
-def inference_run(model, test_loader, device):
+def inference(model, test_loader, device):
     model.eval()
     for data in test_loader:
         data = data.to(device)

From f888faff2734c51aa3a8e68907523c9a6e5fca6c Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Tue, 12 Jul 2022 19:45:48 +0800
Subject: [PATCH 15/24] Update script for missing rename

---
 examples/hetero/to_hetero_mag.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/hetero/to_hetero_mag.py b/examples/hetero/to_hetero_mag.py
index d0b0bd4a525b..aa002c961ce0 100644
--- a/examples/hetero/to_hetero_mag.py
+++ b/examples/hetero/to_hetero_mag.py
@@ -12,7 +12,7 @@
 from torch_geometric.datasets import OGB_MAG
 from torch_geometric.loader import HGTLoader, NeighborLoader
 from torch_geometric.nn import Linear, SAGEConv, Sequential, to_hetero
-from torch_geometric.profile import trace_handler
+from torch_geometric.profile import rename_profile_file, trace_handler
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--use_hgt_loader', action='store_true')
@@ -124,6 +124,7 @@ def inference(loader):
                         ], on_trace_ready=trace_handler) as p:
                     inference(val_loader)
                     p.step()
+                rename_profile_file('to_hetero_mag')  # after profiler exit
             else:
                 if torch.cuda.is_available():
                     torch.cuda.synchronize()

From 55c5c42a13da47fa26e96b2f1f118822f90eb76d Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Tue, 12 Jul 2022 21:17:14 +0800
Subject: [PATCH 16/24] Update scripts according to the comments

---
 benchmark/citation/train_eval.py   | 55 +++++++++++++++---------------
 torch_geometric/profile/profile.py |  5 ++-
 2 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/benchmark/citation/train_eval.py b/benchmark/citation/train_eval.py
index 9485a834e51c..88a2b6c9e96e 100644
--- a/benchmark/citation/train_eval.py
+++ b/benchmark/citation/train_eval.py
@@ -94,36 +94,36 @@ def run_inference(dataset, model, runs, epochs, profiling, permute_masks=None,
                   logger=None):
     for i in range(runs):
         data = dataset[0]
-    if permute_masks is not None:
-        data = permute_masks(data, dataset.num_classes)
-    data = data.to(device)
-
-    model.to(device).reset_parameters()
-
-    for epoch in range(1, epochs + 1):
-        if i == runs - 1 and epoch == epochs:
-            if profiling:
-                with profile(
-                        activities=[
-                            ProfilerActivity.CPU, ProfilerActivity.CUDA
-                        ], on_trace_ready=trace_handler) as p:
+        if permute_masks is not None:
+            data = permute_masks(data, dataset.num_classes)
+        data = data.to(device)
+
+        model.to(device).reset_parameters()
+
+        for epoch in range(1, epochs + 1):
+            if i == runs - 1 and epoch == epochs:
+                if profiling:
+                    with profile(
+                            activities=[
+                                ProfilerActivity.CPU, ProfilerActivity.CUDA
+                            ], on_trace_ready=trace_handler) as p:
+                        inference(model, data)
+                        p.step()
+                else:
+                    if torch.cuda.is_available():
+                        torch.cuda.synchronize()
+                    t_start = time.time()
+
                     inference(model, data)
-                    p.step()
-            else:
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_start = time.time()
 
+                    if torch.cuda.is_available():
+                        torch.cuda.synchronize()
+                    t_end = time.time()
+                    duration = t_end - t_start
+                    print("End-to-End time: {} s".format(duration), flush=True)
+            else:
                 inference(model, data)
 
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_end = time.time()
-                duration = t_end - t_start
-                print("End-to-End time: {} s".format(duration), flush=True)
-        else:
-            inference(model, data)
-
 
 def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
         inference, profiling, permute_masks=None, logger=None):
@@ -166,5 +166,4 @@ def evaluate(model, data):
 @torch.no_grad()
 def inference(model, data):
     model.eval()
-    with torch.no_grad():
-        model(data)
+    model(data)
diff --git a/torch_geometric/profile/profile.py b/torch_geometric/profile/profile.py
index f9c5ceb76eff..a889cfacdc2c 100644
--- a/torch_geometric/profile/profile.py
+++ b/torch_geometric/profile/profile.py
@@ -1,3 +1,5 @@
+import os
+import pathlib
 from typing import Any, List, NamedTuple, Tuple
 
 import torch
@@ -181,15 +183,12 @@ def trace_handler(p):
         profile_sort = 'self_cpu_time_total'
     output = p.key_averages().table(sort_by=profile_sort)
     print(output)
-    import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
     timeline_file = profile_dir + 'timeline' + '.json'
     p.export_chrome_trace(timeline_file)
 
 
 def rename_profile_file(*args):
-    import os
-    import pathlib
     profile_dir = str(pathlib.Path.cwd()) + '/'
     timeline_file = profile_dir + 'profile'
     for arg in args:

From 00bca650030adf6ffbc0103fdbb5de59e6534873 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Tue, 12 Jul 2022 22:12:31 +0800
Subject: [PATCH 17/24] Add CPU test for profile

---
 benchmark/citation/inference.sh | 72 ++++++++++++++++-----------------
 test/profile/test_profile.py    | 24 ++++++++---
 2 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/benchmark/citation/inference.sh b/benchmark/citation/inference.sh
index cb9f3e8f23c6..5d425663bb8c 100755
--- a/benchmark/citation/inference.sh
+++ b/benchmark/citation/inference.sh
@@ -5,114 +5,114 @@ echo "===="
 
 echo "GCN"
 python gcn.py --dataset=Cora --inference
-python gcn.py --dataset=Cora --random_splits=True --inference
+python gcn.py --dataset=Cora --random_splits --inference
 python gcn.py --dataset=Cora --inference --profile
-python gcn.py --dataset=Cora --random_splits=True --inference --profile
+python gcn.py --dataset=Cora --random_splits --inference --profile
 
 echo "GAT"
 python gat.py --dataset=Cora --inference
-python gat.py --dataset=Cora --random_splits=True --inference
+python gat.py --dataset=Cora --random_splits --inference
 python gat.py --dataset=Cora --inference --profile
-python gat.py --dataset=Cora --random_splits=True --inference --profile
+python gat.py --dataset=Cora --random_splits --inference --profile
 
 echo "Cheby"
 python cheb.py --dataset=Cora --num_hops=3 --inference
-python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference
+python cheb.py --dataset=Cora --num_hops=3 --random_splits --inference
 python cheb.py --dataset=Cora --num_hops=3 --inference --profile
-python cheb.py --dataset=Cora --num_hops=3 --random_splits=True --inference --profile
+python cheb.py --dataset=Cora --num_hops=3 --random_splits --inference --profile
 
 echo "SGC"
 python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits --inference
 python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --inference --profile
-python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits=True --inference --profile
+python sgc.py --dataset=Cora --K=3 --weight_decay=0.0005 --random_splits --inference --profile
 
 echo "ARMA"
 python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference
-python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference
+python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits --inference
 python arma.py --dataset=Cora --num_stacks=2 --num_layers=1 --shared_weights=True --inference --profile
-python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference --profile
+python arma.py --dataset=Cora --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits --inference --profile
 
 echo "APPNP"
 python appnp.py --dataset=Cora --alpha=0.1 --inference
-python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference
+python appnp.py --dataset=Cora --alpha=0.1 --random_splits --inference
 python appnp.py --dataset=Cora --alpha=0.1 --inference --profile
-python appnp.py --dataset=Cora --alpha=0.1 --random_splits=True --inference --profile
+python appnp.py --dataset=Cora --alpha=0.1 --random_splits --inference --profile
 
 echo "CiteSeer"
 echo "========"
 
 echo "GCN"
 python gcn.py --dataset=CiteSeer --inference
-python gcn.py --dataset=CiteSeer --random_splits=True --inference
+python gcn.py --dataset=CiteSeer --random_splits --inference
 python gcn.py --dataset=CiteSeer --inference --profile
-python gcn.py --dataset=CiteSeer --random_splits=True --inference --profile
+python gcn.py --dataset=CiteSeer --random_splits --inference --profile
 
 echo "GAT"
 python gat.py --dataset=CiteSeer --inference
-python gat.py --dataset=CiteSeer --random_splits=True --inference
+python gat.py --dataset=CiteSeer --random_splits --inference
 python gat.py --dataset=CiteSeer --inference --profile
-python gat.py --dataset=CiteSeer --random_splits=True --inference --profile
+python gat.py --dataset=CiteSeer --random_splits --inference --profile
 
 echo "Cheby"
 python cheb.py --dataset=CiteSeer --num_hops=2 --inference
-python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference
+python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits --inference
 python cheb.py --dataset=CiteSeer --num_hops=2 --inference --profile
-python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits=True --inference --profile
+python cheb.py --dataset=CiteSeer --num_hops=3 --random_splits --inference --profile
 
 echo "SGC"
 python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits --inference
 python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --inference --profile
-python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits=True --inference --profile
+python sgc.py --dataset=CiteSeer --K=2 --weight_decay=0.005 --random_splits --inference --profile
 
 echo "ARMA"
 python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits --inference
 python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --inference --profile
-python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits=True --inference --profile
+python arma.py --dataset=CiteSeer --num_stacks=3 --num_layers=1 --shared_weights=True --random_splits --inference --profile
 
 echo "APPNP"
 python appnp.py --dataset=CiteSeer --alpha=0.1 --inference
-python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference
+python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits --inference
 python appnp.py --dataset=CiteSeer --alpha=0.1 --inference --profile
-python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits=True --inference --profile
+python appnp.py --dataset=CiteSeer --alpha=0.1 --random_splits --inference --profile
 
 echo "PubMed"
 echo "======"
 
 echo "GCN"
 python gcn.py --dataset=PubMed --inference
-python gcn.py --dataset=PubMed --random_splits=True --inference
+python gcn.py --dataset=PubMed --random_splits --inference
 python gcn.py --dataset=PubMed --inference --profile
-python gcn.py --dataset=PubMed --random_splits=True --inference --profile
+python gcn.py --dataset=PubMed --random_splits --inference --profile
 
 echo "GAT"
 python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits --inference
 python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --inference --profile
-python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits=True --inference --profile
+python gat.py --dataset=PubMed --lr=0.01 --weight_decay=0.001 --output_heads=8 --random_splits --inference --profile
 
 echo "Cheby"
 python cheb.py --dataset=PubMed --num_hops=2 --inference
-python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference
+python cheb.py --dataset=PubMed --num_hops=2 --random_splits --inference
 python cheb.py --dataset=PubMed --num_hops=2 --inference --profile
-python cheb.py --dataset=PubMed --num_hops=2 --random_splits=True --inference --profile
+python cheb.py --dataset=PubMed --num_hops=2 --random_splits --inference --profile
 
 echo "SGC"
 python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits --inference
 python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --inference --profile
-python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits=True --inference --profile
+python sgc.py --dataset=PubMed --K=2 --weight_decay=0.0005 --random_splits --inference --profile
 
 echo "ARMA"
 python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits --inference
 python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0 --inference --profile
-python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits=True --inference --profile
+python arma.py --dataset=PubMed --num_stacks=2 --num_layers=1 --skip_dropout=0.5 --random_splits --inference --profile
 
 echo "APPNP"
 python appnp.py --dataset=PubMed --alpha=0.1 --inference
-python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference
+python appnp.py --dataset=PubMed --alpha=0.1 --random_splits --inference
 python appnp.py --dataset=PubMed --alpha=0.1 --inference --profile
-python appnp.py --dataset=PubMed --alpha=0.1 --random_splits=True --inference --profile
+python appnp.py --dataset=PubMed --alpha=0.1 --random_splits --inference --profile
diff --git a/test/profile/test_profile.py b/test/profile/test_profile.py
index 06bcb0f908d0..7fbd63422459 100644
--- a/test/profile/test_profile.py
+++ b/test/profile/test_profile.py
@@ -1,3 +1,5 @@
+import os.path
+
 import torch
 import torch.nn.functional as F
 from torch.profiler import ProfilerActivity, profile
@@ -48,11 +50,7 @@ def test(model, x, edge_index, y):
         assert stats.nvidia_smi_free_cuda > 0
         assert stats.nvidia_smi_used_cuda > 0
 
-        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-                     on_trace_ready=trace_handler) as p:
-            _, time = test(model, data.x, data.edge_index, data.y)
-            p.step()
-
+        _, time = test(model, data.x, data.edge_index, data.y)
         assert time > 0
 
         if epoch >= 2:  # Warm-up
@@ -68,6 +66,20 @@ def test(model, x, edge_index, y):
     assert stats_summary.min_nvidia_smi_free_cuda > 0
     assert stats_summary.max_nvidia_smi_used_cuda > 0
 
+
+def test_trace_handler(get_dataset):
+    dataset = get_dataset(name='PubMed')
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    data = dataset[0].to(device)
+    model = GraphSAGE(dataset.num_features, hidden_channels=64, num_layers=3,
+                      out_channels=dataset.num_classes).to(device)
+    model.eval()
+
+    for epoch in range(3):
+        print("epoch ", epoch)
+        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                     on_trace_ready=trace_handler) as p:
+            model(data.x, data.edge_index)
+            p.step()
     rename_profile_file('test_profile')
-    import os.path
     assert os.path.exists('profile-test_profile.json')

From cef334d6278c1144217f2a35abb259f7762c7a45 Mon Sep 17 00:00:00 2001
From: rusty1s <matthias.fey@tu-dortmund.de>
Date: Wed, 13 Jul 2022 12:59:01 +0000
Subject: [PATCH 18/24] update

---
 benchmark/citation/appnp.py |  6 +++---
 benchmark/citation/arma.py  | 16 ++++++++--------
 benchmark/citation/cheb.py  |  8 ++++----
 benchmark/citation/gat.py   | 14 +++++++-------
 benchmark/citation/gcn.py   |  8 ++++----
 benchmark/citation/sgc.py   |  8 ++++----
 6 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/benchmark/citation/appnp.py b/benchmark/citation/appnp.py
index f7d7d63ac6a5..03573cacbccf 100644
--- a/benchmark/citation/appnp.py
+++ b/benchmark/citation/appnp.py
@@ -5,7 +5,7 @@
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 from torch.nn import Linear
 
-from torch_geometric.nn import APPNP as Conv
+from torch_geometric.nn import APPNP
 from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
@@ -31,7 +31,7 @@ def __init__(self, dataset):
         super().__init__()
         self.lin1 = Linear(dataset.num_features, args.hidden)
         self.lin2 = Linear(args.hidden, dataset.num_classes)
-        self.prop1 = Conv(args.K, args.alpha)
+        self.prop1 = APPNP(args.K, args.alpha)
 
     def reset_parameters(self):
         self.lin1.reset_parameters()
@@ -53,5 +53,5 @@ def forward(self, data):
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    rename_profile_file('citation', Conv.__name__, args.dataset,
+    rename_profile_file('citation', APPNP.__name__, args.dataset,
                         str(args.random_splits))
diff --git a/benchmark/citation/arma.py b/benchmark/citation/arma.py
index 482013be9878..65cc9029fe5f 100644
--- a/benchmark/citation/arma.py
+++ b/benchmark/citation/arma.py
@@ -4,7 +4,7 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import ARMAConv as Conv
+from torch_geometric.nn import ARMAConv
 from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
@@ -30,12 +30,12 @@
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = Conv(dataset.num_features, args.hidden, args.num_stacks,
-                          args.num_layers, args.shared_weights,
-                          dropout=args.skip_dropout)
-        self.conv2 = Conv(args.hidden, dataset.num_classes, args.num_stacks,
-                          args.num_layers, args.shared_weights,
-                          dropout=args.skip_dropout)
+        self.conv1 = ARMAConv(dataset.num_features, args.hidden,
+                              args.num_stacks, args.num_layers,
+                              args.shared_weights, dropout=args.skip_dropout)
+        self.conv2 = ARMAConv(args.hidden, dataset.num_classes,
+                              args.num_stacks, args.num_layers,
+                              args.shared_weights, dropout=args.skip_dropout)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -55,5 +55,5 @@ def forward(self, data):
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    rename_profile_file('citation', Conv.__name__, args.dataset,
+    rename_profile_file('citation', ARMAConv.__name__, args.dataset,
                         str(args.random_splits))
diff --git a/benchmark/citation/cheb.py b/benchmark/citation/cheb.py
index e258b76aab84..79f0182adc2c 100644
--- a/benchmark/citation/cheb.py
+++ b/benchmark/citation/cheb.py
@@ -4,7 +4,7 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import ChebConv as Conv
+from torch_geometric.nn import ChebConv
 from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
@@ -27,8 +27,8 @@
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = Conv(dataset.num_features, args.hidden, args.num_hops)
-        self.conv2 = Conv(args.hidden, dataset.num_classes, args.num_hops)
+        self.conv1 = ChebConv(dataset.num_features, args.hidden, args.num_hops)
+        self.conv2 = ChebConv(args.hidden, dataset.num_classes, args.num_hops)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -48,5 +48,5 @@ def forward(self, data):
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    rename_profile_file('citation', Conv.__name__, args.dataset,
+    rename_profile_file('citation', ChebConv.__name__, args.dataset,
                         str(args.random_splits))
diff --git a/benchmark/citation/gat.py b/benchmark/citation/gat.py
index 55b178c1bbae..6d171370c269 100644
--- a/benchmark/citation/gat.py
+++ b/benchmark/citation/gat.py
@@ -4,7 +4,7 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import GATConv as Conv
+from torch_geometric.nn import GATConv
 from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
@@ -28,11 +28,11 @@
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = Conv(dataset.num_features, args.hidden, heads=args.heads,
-                          dropout=args.dropout)
-        self.conv2 = Conv(args.hidden * args.heads, dataset.num_classes,
-                          heads=args.output_heads, concat=False,
-                          dropout=args.dropout)
+        self.conv1 = GATConv(dataset.num_features, args.hidden,
+                             heads=args.heads, dropout=args.dropout)
+        self.conv2 = GATConv(args.hidden * args.heads, dataset.num_classes,
+                             heads=args.output_heads, concat=False,
+                             dropout=args.dropout)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -53,5 +53,5 @@ def forward(self, data):
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    rename_profile_file('citation', Conv.__name__, args.dataset,
+    rename_profile_file('citation', GATConv.__name__, args.dataset,
                         str(args.random_splits))
diff --git a/benchmark/citation/gcn.py b/benchmark/citation/gcn.py
index ae1eed53bd1a..b42b531a2fdf 100644
--- a/benchmark/citation/gcn.py
+++ b/benchmark/citation/gcn.py
@@ -4,7 +4,7 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import GCNConv as Conv
+from torch_geometric.nn import GCNConv
 from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
@@ -26,8 +26,8 @@
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = Conv(dataset.num_features, args.hidden)
-        self.conv2 = Conv(args.hidden, dataset.num_classes)
+        self.conv1 = GCNConv(dataset.num_features, args.hidden)
+        self.conv2 = GCNConv(args.hidden, dataset.num_classes)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -47,5 +47,5 @@ def forward(self, data):
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    rename_profile_file('citation', Conv.__name__, args.dataset,
+    rename_profile_file('citation', GCNConv.__name__, args.dataset,
                         str(args.random_splits))
diff --git a/benchmark/citation/sgc.py b/benchmark/citation/sgc.py
index 1acec1fc1d0f..633ffb208d25 100644
--- a/benchmark/citation/sgc.py
+++ b/benchmark/citation/sgc.py
@@ -4,7 +4,7 @@
 import torch.nn.functional as F
 from citation import get_planetoid_dataset, random_planetoid_splits, run
 
-from torch_geometric.nn import SGConv as Conv
+from torch_geometric.nn import SGConv
 from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
@@ -25,8 +25,8 @@
 class Net(torch.nn.Module):
     def __init__(self, dataset):
         super().__init__()
-        self.conv1 = Conv(dataset.num_features, dataset.num_classes, K=args.K,
-                          cached=True)
+        self.conv1 = SGConv(dataset.num_features, dataset.num_classes,
+                            K=args.K, cached=True)
 
     def reset_parameters(self):
         self.conv1.reset_parameters()
@@ -43,5 +43,5 @@ def forward(self, data):
     args.early_stopping, args.inference, args.profile, permute_masks)
 
 if args.profile:
-    rename_profile_file('citation', Conv.__name__, args.dataset,
+    rename_profile_file('citation', SGConv.__name__, args.dataset,
                         str(args.random_splits))

From e07d068fd79b4d3a0449427ab691de71e05d441b Mon Sep 17 00:00:00 2001
From: rusty1s <matthias.fey@tu-dortmund.de>
Date: Wed, 13 Jul 2022 13:04:59 +0000
Subject: [PATCH 19/24] update: simplify citation `run_inference` (drop `runs` loop, add `torch.no_grad`, profile after timing)

---
 benchmark/citation/train_eval.py | 60 +++++++++++++++-----------------
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/benchmark/citation/train_eval.py b/benchmark/citation/train_eval.py
index 88a2b6c9e96e..15a16fbc0b36 100644
--- a/benchmark/citation/train_eval.py
+++ b/benchmark/citation/train_eval.py
@@ -90,39 +90,36 @@ def run_train(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
           f'Duration: {float(duration.mean()):.3f}s')
 
 
-def run_inference(dataset, model, runs, epochs, profiling, permute_masks=None,
+@torch.no_grad()
+def run_inference(dataset, model, epochs, profiling, permute_masks=None,
                   logger=None):
-    for i in range(runs):
-        data = dataset[0]
-        if permute_masks is not None:
-            data = permute_masks(data, dataset.num_classes)
-        data = data.to(device)
+    data = dataset[0]
+    if permute_masks is not None:
+        data = permute_masks(data, dataset.num_classes)
+    data = data.to(device)
 
-        model.to(device).reset_parameters()
+    model.to(device).reset_parameters()
 
-        for epoch in range(1, epochs + 1):
-            if i == runs - 1 and epoch == epochs:
-                if profiling:
-                    with profile(
-                            activities=[
-                                ProfilerActivity.CPU, ProfilerActivity.CUDA
-                            ], on_trace_ready=trace_handler) as p:
-                        inference(model, data)
-                        p.step()
-                else:
-                    if torch.cuda.is_available():
-                        torch.cuda.synchronize()
-                    t_start = time.time()
-
-                    inference(model, data)
-
-                    if torch.cuda.is_available():
-                        torch.cuda.synchronize()
-                    t_end = time.time()
-                    duration = t_end - t_start
-                    print("End-to-End time: {} s".format(duration), flush=True)
-            else:
-                inference(model, data)
+    for epoch in range(1, epochs + 1):
+        if epoch == epochs:
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            t_start = time.time()
+
+        inference(model, data)
+
+        if epoch == epochs:
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            t_end = time.time()
+            duration = t_end - t_start
+            print(f'End-to-End Inference Time: {duration:.8f}s', flush=True)
+
+    if profiling:
+        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                     on_trace_ready=trace_handler) as p:
+            inference(model, data)
+            p.step()
 
 
 def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
@@ -131,8 +128,7 @@ def run(dataset, model, runs, epochs, lr, weight_decay, early_stopping,
         run_train(dataset, model, runs, epochs, lr, weight_decay,
                   early_stopping, permute_masks, logger)
     else:
-        run_inference(dataset, model, runs, epochs, profiling, permute_masks,
-                      logger)
+        run_inference(dataset, model, epochs, profiling, permute_masks, logger)
 
 
 def train(model, optimizer, data):

From b1a4620283f7489c2c16c07992655f649955784d Mon Sep 17 00:00:00 2001
From: rusty1s <matthias.fey@tu-dortmund.de>
Date: Wed, 13 Jul 2022 13:08:50 +0000
Subject: [PATCH 20/24] update: add `--inference`/`--profile` flags to the remaining points benchmarks

---
 benchmark/points/edge_cnn.py   | 2 +-
 benchmark/points/mpnn.py       | 9 ++++++++-
 benchmark/points/point_cnn.py  | 9 ++++++++-
 benchmark/points/point_net.py  | 9 ++++++++-
 benchmark/points/spline_cnn.py | 9 ++++++++-
 5 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/benchmark/points/edge_cnn.py b/benchmark/points/edge_cnn.py
index 1541216906fb..6aed1ae71a17 100644
--- a/benchmark/points/edge_cnn.py
+++ b/benchmark/points/edge_cnn.py
@@ -62,4 +62,4 @@ def forward(self, pos, batch):
     args.inference, args.profile)
 
 if args.profile:
-    rename_profile_file('points', 'DynamicEdgeConv')
+    rename_profile_file('points', DynamicEdgeConv.__name__)
diff --git a/benchmark/points/mpnn.py b/benchmark/points/mpnn.py
index f9be424dc5f9..8bf4633d3003 100644
--- a/benchmark/points/mpnn.py
+++ b/benchmark/points/mpnn.py
@@ -9,6 +9,7 @@
 from torch.nn import Sequential as Seq
 
 from torch_geometric.nn import NNConv, fps, global_mean_pool, radius_graph
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--epochs', type=int, default=200)
@@ -17,6 +18,8 @@
 parser.add_argument('--lr_decay_factor', type=float, default=0.5)
 parser.add_argument('--lr_decay_step_size', type=int, default=50)
 parser.add_argument('--weight_decay', type=float, default=0)
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
@@ -72,4 +75,8 @@ def forward(self, pos, batch):
 train_dataset, test_dataset = get_dataset(num_points=1024)
 model = Net(train_dataset.num_classes)
 run(train_dataset, test_dataset, model, args.epochs, args.batch_size, args.lr,
-    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay)
+    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay,
+    args.inference, args.profile)
+
+if args.profile:
+    rename_profile_file('points', NNConv.__name__)
diff --git a/benchmark/points/point_cnn.py b/benchmark/points/point_cnn.py
index 3746945234ec..59501cc9a63a 100644
--- a/benchmark/points/point_cnn.py
+++ b/benchmark/points/point_cnn.py
@@ -7,6 +7,7 @@
 from torch.nn import Linear as Lin
 
 from torch_geometric.nn import XConv, fps, global_mean_pool
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--epochs', type=int, default=200)
@@ -15,6 +16,8 @@
 parser.add_argument('--lr_decay_factor', type=float, default=0.5)
 parser.add_argument('--lr_decay_step_size', type=int, default=50)
 parser.add_argument('--weight_decay', type=float, default=0)
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
@@ -60,4 +63,8 @@ def forward(self, pos, batch):
 train_dataset, test_dataset = get_dataset(num_points=1024)
 model = Net(train_dataset.num_classes)
 run(train_dataset, test_dataset, model, args.epochs, args.batch_size, args.lr,
-    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay)
+    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay,
+    args.inference, args.profile)
+
+if args.profile:
+    rename_profile_file('points', XConv.__name__)
diff --git a/benchmark/points/point_net.py b/benchmark/points/point_net.py
index 2dc5fcf51da7..9ee7546e0f29 100644
--- a/benchmark/points/point_net.py
+++ b/benchmark/points/point_net.py
@@ -9,6 +9,7 @@
 from torch.nn import Sequential as Seq
 
 from torch_geometric.nn import PointConv, fps, global_max_pool, radius_graph
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--epochs', type=int, default=200)
@@ -17,6 +18,8 @@
 parser.add_argument('--lr_decay_factor', type=float, default=0.5)
 parser.add_argument('--lr_decay_step_size', type=int, default=50)
 parser.add_argument('--weight_decay', type=float, default=0)
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
@@ -68,4 +71,8 @@ def forward(self, pos, batch):
 train_dataset, test_dataset = get_dataset(num_points=1024)
 model = Net(train_dataset.num_classes)
 run(train_dataset, test_dataset, model, args.epochs, args.batch_size, args.lr,
-    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay)
+    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay,
+    args.inference, args.profile)
+
+if args.profile:
+    rename_profile_file('points', PointConv.__name__)
diff --git a/benchmark/points/spline_cnn.py b/benchmark/points/spline_cnn.py
index 481556a84d83..383195a2d871 100644
--- a/benchmark/points/spline_cnn.py
+++ b/benchmark/points/spline_cnn.py
@@ -7,6 +7,7 @@
 from torch.nn import Linear as Lin
 
 from torch_geometric.nn import SplineConv, fps, global_mean_pool, radius_graph
+from torch_geometric.profile import rename_profile_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--epochs', type=int, default=200)
@@ -15,6 +16,8 @@
 parser.add_argument('--lr_decay_factor', type=float, default=0.5)
 parser.add_argument('--lr_decay_step_size', type=int, default=50)
 parser.add_argument('--weight_decay', type=float, default=0)
+parser.add_argument('--inference', action='store_true')
+parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 
@@ -69,4 +72,8 @@ def forward(self, pos, batch):
 train_dataset, test_dataset = get_dataset(num_points=1024)
 model = Net(train_dataset.num_classes)
 run(train_dataset, test_dataset, model, args.epochs, args.batch_size, args.lr,
-    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay)
+    args.lr_decay_factor, args.lr_decay_step_size, args.weight_decay,
+    args.inference, args.profile)
+
+if args.profile:
+    rename_profile_file('points', SplineConv.__name__)

From e6ffafe6720052b3219f14186cfae89e6f2dcad3 Mon Sep 17 00:00:00 2001
From: rusty1s <matthias.fey@tu-dortmund.de>
Date: Wed, 13 Jul 2022 13:12:21 +0000
Subject: [PATCH 21/24] update: simplify points `run_inference` and move model to device in `run_train`

---
 benchmark/points/train_eval.py | 43 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/benchmark/points/train_eval.py b/benchmark/points/train_eval.py
index ca0cfa6fe6fc..4fee844987fd 100644
--- a/benchmark/points/train_eval.py
+++ b/benchmark/points/train_eval.py
@@ -13,12 +13,13 @@
 
 def run_train(train_dataset, test_dataset, model, epochs, batch_size, lr,
               lr_decay_factor, lr_decay_step_size, weight_decay):
+    model = model.to(device)
     optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+
     train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size, shuffle=False)
 
     for epoch in range(1, epochs + 1):
-        print("Epoch {} starts".format(epoch))
         if torch.cuda.is_available():
             torch.cuda.synchronize()
 
@@ -40,34 +41,31 @@ def run_train(train_dataset, test_dataset, model, epochs, batch_size, lr,
                 param_group['lr'] = lr_decay_factor * param_group['lr']
 
 
+@torch.no_grad()
 def run_inference(test_dataset, model, epochs, batch_size, profiling):
     model = model.to(device)
     test_loader = DataLoader(test_dataset, batch_size, shuffle=False)
 
     for epoch in range(1, epochs + 1):
-        print("Epoch {} starts".format(epoch))
         if epoch == epochs:
-            if profiling:
-                with profile(
-                        activities=[
-                            ProfilerActivity.CPU, ProfilerActivity.CUDA
-                        ], on_trace_ready=trace_handler) as p:
-                    inference(model, test_loader, device)
-                    p.step()
-            else:
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_start = time.time()
-
-                inference(model, test_loader, device)
-
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_end = time.time()
-                duration = t_end - t_start
-                print("End-to-End time: {} s".format(duration), flush=True)
-        else:
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            t_start = time.time()
+
+        inference(model, test_loader, device)
+
+        if epoch == epochs:
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            t_end = time.time()
+            duration = t_end - t_start
+            print(f'End-to-End Inference Time: {duration:.8f}s', flush=True)
+
+    if profiling:
+        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                     on_trace_ready=trace_handler) as p:
             inference(model, test_loader, device)
+            p.step()
 
 
 def run(train_dataset, test_dataset, model, epochs, batch_size, lr,
@@ -92,6 +90,7 @@ def train(model, optimizer, train_loader, device):
         optimizer.step()
 
 
+@torch.no_grad()
 def test(model, test_loader, device):
     model.eval()
 

From fea0f4301c084a440d91a7e86fb7c84c85b50e24 Mon Sep 17 00:00:00 2001
From: rusty1s <matthias.fey@tu-dortmund.de>
Date: Wed, 13 Jul 2022 13:13:32 +0000
Subject: [PATCH 22/24] reset: remove inference/profile code from `to_hetero_mag.py` and `pna.py` examples

---
 examples/hetero/to_hetero_mag.py | 47 +++------------------------
 examples/pna.py                  | 55 ++++----------------------------
 2 files changed, 12 insertions(+), 90 deletions(-)

diff --git a/examples/hetero/to_hetero_mag.py b/examples/hetero/to_hetero_mag.py
index aa002c961ce0..6605038c9af3 100644
--- a/examples/hetero/to_hetero_mag.py
+++ b/examples/hetero/to_hetero_mag.py
@@ -1,23 +1,18 @@
 import argparse
 import os.path as osp
-import time
 
 import torch
 import torch.nn.functional as F
 from torch.nn import ReLU
-from torch.profiler import ProfilerActivity, profile
 from tqdm import tqdm
 
 import torch_geometric.transforms as T
 from torch_geometric.datasets import OGB_MAG
 from torch_geometric.loader import HGTLoader, NeighborLoader
 from torch_geometric.nn import Linear, SAGEConv, Sequential, to_hetero
-from torch_geometric.profile import rename_profile_file, trace_handler
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--use_hgt_loader', action='store_true')
-parser.add_argument('--inference', action='store_true')
-parser.add_argument('--profile', action='store_true')
 args = parser.parse_args()
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -98,42 +93,10 @@ def test(loader):
     return total_correct / total_examples
 
 
-@torch.no_grad()
-def inference(loader):
-    model.eval()
-    for batch in tqdm(loader):
-        batch = batch.to(device, 'edge_index')
-        model(batch.x_dict, batch.edge_index_dict)
-
-
 init_params()  # Initialize parameters.
-if not args.inference:
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
 
-    for epoch in range(1, 21):
-        loss = train()
-        val_acc = test(val_loader)
-        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_acc:.4f}')
-else:
-    for epoch in range(1, 21):
-        if epoch == 20:
-            if args.profile:
-                with profile(
-                        activities=[
-                            ProfilerActivity.CPU, ProfilerActivity.CUDA
-                        ], on_trace_ready=trace_handler) as p:
-                    inference(val_loader)
-                    p.step()
-                    rename_profile_file('to_hetero_mag')
-            else:
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_start = time.time()
-                inference(val_loader)
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_end = time.time()
-                duration = t_end - t_start
-                print("End-to-End time: {} s".format(duration), flush=True)
-        else:
-            inference(val_loader)
+for epoch in range(1, 21):
+    loss = train()
+    val_acc = test(val_loader)
+    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_acc:.4f}')
diff --git a/examples/pna.py b/examples/pna.py
index a7ee23bd6367..4697f49d7121 100644
--- a/examples/pna.py
+++ b/examples/pna.py
@@ -1,24 +1,15 @@
-import argparse
 import os.path as osp
-import time
 
 import torch
 import torch.nn.functional as F
 from torch.nn import Embedding, Linear, ModuleList, ReLU, Sequential
 from torch.optim.lr_scheduler import ReduceLROnPlateau
-from torch.profiler import ProfilerActivity, profile
 
 from torch_geometric.datasets import ZINC
 from torch_geometric.loader import DataLoader
 from torch_geometric.nn import BatchNorm, PNAConv, global_add_pool
-from torch_geometric.profile import rename_profile_file, trace_handler
 from torch_geometric.utils import degree
 
-parser = argparse.ArgumentParser()
-parser.add_argument('--inference', action='store_true')
-parser.add_argument('--profile', action='store_true')
-args = parser.parse_args()
-
 path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'ZINC')
 train_dataset = ZINC(path, subset=True, split='train')
 val_dataset = ZINC(path, subset=True, split='val')
@@ -109,42 +100,10 @@ def test(loader):
     return total_error / len(loader.dataset)
 
 
-@torch.no_grad()
-def inference(loader):
-    model.eval()
-    for data in loader:
-        data = data.to(device)
-        model(data.x, data.edge_index, data.edge_attr, data.batch)
-
-
-if not args.inference:
-    for epoch in range(1, 301):
-        loss = train(epoch)
-        val_mae = test(val_loader)
-        test_mae = test(test_loader)
-        scheduler.step(val_mae)
-        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_mae:.4f}, '
-              f'Test: {test_mae:.4f}')
-else:
-    for epoch in range(1, 301):
-        if epoch == 300:
-            if args.profile:
-                with profile(
-                        activities=[
-                            ProfilerActivity.CPU, ProfilerActivity.CUDA
-                        ], on_trace_ready=trace_handler) as p:
-                    inference(test_loader)
-                    p.step()
-                rename_profile_file('pna')
-            else:
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_start = time.time()
-                inference(test_loader)
-                if torch.cuda.is_available():
-                    torch.cuda.synchronize()
-                t_end = time.time()
-                duration = t_end - t_start
-                print("End-to-End time: {} s".format(duration), flush=True)
-        else:
-            inference(test_loader)
+for epoch in range(1, 301):
+    loss = train(epoch)
+    val_mae = test(val_loader)
+    test_mae = test(test_loader)
+    scheduler.step(val_mae)
+    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_mae:.4f}, '
+          f'Test: {test_mae:.4f}')

From fd39e2691fd9dd37fe2650129fdbe4daed5aadf5 Mon Sep 17 00:00:00 2001
From: rusty1s <matthias.fey@tu-dortmund.de>
Date: Wed, 13 Jul 2022 13:19:43 +0000
Subject: [PATCH 23/24] update: mark profile tests as `onlyFullTest` and drop debug print

---
 test/profile/test_profile.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/profile/test_profile.py b/test/profile/test_profile.py
index 7fbd63422459..8e1d5e197f2f 100644
--- a/test/profile/test_profile.py
+++ b/test/profile/test_profile.py
@@ -12,10 +12,11 @@
     timeit,
     trace_handler,
 )
-from torch_geometric.testing import withCUDA
+from torch_geometric.testing import onlyFullTest, withCUDA
 
 
 @withCUDA
+@onlyFullTest
 def test_profile(get_dataset):
     dataset = get_dataset(name='PubMed')
     data = dataset[0].cuda()
@@ -67,6 +68,7 @@ def test(model, x, edge_index, y):
     assert stats_summary.max_nvidia_smi_used_cuda > 0
 
 
+@onlyFullTest
 def test_trace_handler(get_dataset):
     dataset = get_dataset(name='PubMed')
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -76,7 +78,6 @@ def test_trace_handler(get_dataset):
     model.eval()
 
     for epoch in range(3):
-        print("epoch ", epoch)
         with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                      on_trace_ready=trace_handler) as p:
             model(data.x, data.edge_index)

From 7fd675be387f86efea6d439e65a82390bb26b62a Mon Sep 17 00:00:00 2001
From: rusty1s <matthias.fey@tu-dortmund.de>
Date: Wed, 13 Jul 2022 13:22:07 +0000
Subject: [PATCH 24/24] changelog: add entry for inference benchmarks (#4892)

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5c350e81fb8..5902d685449e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [2.0.5] - 2022-MM-DD
 ### Added
+- Added inference benchmarks ([#4892](https://github.com/pyg-team/pytorch_geometric/pull/4892))
 - Added `unbatch_edge_index` functionality for splitting an `edge_index` tensor according to a `batch` vector ([#4903](https://github.com/pyg-team/pytorch_geometric/pull/4903))
 - Added node-wise normalization mode in `LayerNorm` ([#4944](https://github.com/pyg-team/pytorch_geometric/pull/4944))
 - Added support for `normalization_resolver` ([#4926](https://github.com/pyg-team/pytorch_geometric/pull/4926), [#4951](https://github.com/pyg-team/pytorch_geometric/pull/4951), [#4958](https://github.com/pyg-team/pytorch_geometric/pull/4958), [#4959](https://github.com/pyg-team/pytorch_geometric/pull/4959))