From 29170ae976f3ca970abba64043cfd86e1dff8f90 Mon Sep 17 00:00:00 2001
From: Yukuo Cen <cenyk1230@qq.com>
Date: Fri, 24 Dec 2021 19:08:24 +0800
Subject: [PATCH] [Dataset] Update rd2cd datasets (#323)

---
 cogdl/datasets/geom_data.py                  |  2 +
 cogdl/datasets/rd2cd_data.py                 | 59 +++++++++++---------
 docs/source/tutorial/node_classification.rst |  2 +-
 3 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/cogdl/datasets/geom_data.py b/cogdl/datasets/geom_data.py
index fbb7fead..2631fbd6 100644
--- a/cogdl/datasets/geom_data.py
+++ b/cogdl/datasets/geom_data.py
@@ -99,6 +99,8 @@ def __init__(self, root, name, split=0):
         self.data.val_mask = self.data.all_masks[split]["val"]
         self.data.test_mask = self.data.all_masks[split]["test"]
 
+        self.data.set_asymmetric()
+
     @property
     def raw_file_names(self):
         names = ["out1_graph_edges.txt", "out1_node_feature_label.txt"] + [
diff --git a/cogdl/datasets/rd2cd_data.py b/cogdl/datasets/rd2cd_data.py
index 2147bbb5..c449c28a 100644
--- a/cogdl/datasets/rd2cd_data.py
+++ b/cogdl/datasets/rd2cd_data.py
@@ -1,14 +1,12 @@
-import os
+import os.path as osp
 import random
-import tarfile
 
 import numpy as np
-import requests
 import torch
 from torch import Tensor
 
-from cogdl.data import Graph
-from cogdl.datasets import NodeDataset
+from cogdl.data import Dataset, Graph
+from cogdl.utils import download_url, untar
 
 base_url = "https://cloud.tsinghua.edu.cn/d/65d7c53dd8474d7091a9/files/?p=%2F"
 
@@ -67,44 +65,51 @@ def check_train_containing(train_mask, y):
     return True
 
 
-class RD2CD(NodeDataset):
+class RD2CD(Dataset):
     def __init__(self, root, name):
-        self.root = root
         self.name = name
-        self.source_path = root + "/" + name + "/raw"
-        if not os.path.exists(self.source_path):
-            os.makedirs(self.source_path)
-        dst_path = root + "/" + name + "/processed"
-        if not os.path.exists(dst_path):
-            os.makedirs(dst_path)
-        self.data_path = dst_path + "/data.pt"
-        if not os.path.exists(self.data_path):
-            self.download()
-        super(RD2CD, self).__init__(path=self.data_path, scale_feat=False)
+        path = osp.join(root, name)  # dataset directory: <root>/<name>
+
+        super(RD2CD, self).__init__(path)  # NOTE(review): presumably downloads/processes if needed - confirm
+        self.data = torch.load(self.processed_paths[0])  # load the Graph that process() saved
+
+    @property
+    def raw_file_names(self):
+        names = ["x.npy", "y.npy", "edge_index.npy"]  # the arrays process() loads from raw_dir
+        return names
+
+    @property
+    def processed_file_names(self):
+        return "data.pt"  # single processed file name (a plain string, not a list)
+
+    @property
+    def num_nodes(self):
+        assert hasattr(self.data, "y")  # needs labels; NOTE(review): asserts are stripped under python -O
+        return self.data.y.shape[0]  # length of the first dimension of y
 
     def download(self):
-        r = requests.get(base_url + self.name + ".tgz&dl=1")
-        tarfile_path = self.source_path + "/" + self.name + ".tgz"
-        with open(tarfile_path, "wb") as f:
-            f.write(r.content)
-        with tarfile.open(tarfile_path, "r") as f:
-            f.extractall(self.source_path)
+        fname = "{}.tgz".format(self.name.lower())  # local name is lower-cased; the URL below keeps self.name as-is
+        download_url("{}{}.tgz&dl=1".format(base_url, self.name), self.raw_dir, fname)
+        untar(self.raw_dir, fname)  # presumably extracts the archive inside raw_dir - confirm in cogdl.utils
 
     def process(self):
-        numpy_x = np.load(self.source_path + "/x.npy")
+        numpy_x = np.load(osp.join(self.raw_dir, "x.npy"))  # osp.join, consistent with __init__
         x = torch.from_numpy(numpy_x).to(torch.float)
-        numpy_y = np.load(self.source_path + "/y.npy")
+        numpy_y = np.load(osp.join(self.raw_dir, "y.npy"))
         y = torch.from_numpy(numpy_y).to(torch.long)
-        numpy_edge_index = np.load(self.source_path + "/edge_index.npy")
+        numpy_edge_index = np.load(osp.join(self.raw_dir, "edge_index.npy"))
         edge_index = torch.from_numpy(numpy_edge_index).to(torch.long)
 
         # set train/val/test mask in node_classification task
         random_seed = 14530529  # a fixed seed
         (train_mask, val_mask, test_mask) = get_whole_mask(y, "6-2-2", random_seed)
         data = Graph(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
-        torch.save(data, self.data_path)
+        torch.save(data, self.processed_paths[0])  # same path __init__ loads self.data from
         return data
 
+    def get(self, idx):
+        return self.data  # idx is ignored: the same stored graph is returned for every index
+
 
 class Github(RD2CD):
     def __init__(self, root="data"):
diff --git a/docs/source/tutorial/node_classification.rst b/docs/source/tutorial/node_classification.rst
index 42599008..1a4a42d5 100644
--- a/docs/source/tutorial/node_classification.rst
+++ b/docs/source/tutorial/node_classification.rst
@@ -48,7 +48,7 @@ CogDL supports saving the trained model with ``checkpoint_path`` in command line
     experiment(model="gcn", dataset="cora", checkpoint_path="gcn_cora.pt")
 
 
-When the training stops, the model will be saved in `gcn_cora.py`. If you want to continue the training from previous checkpoint
+When the training stops, the model will be saved in ``gcn_cora.pt``. If you want to continue training from a previous checkpoint
 with different parameters(such as learning rate, weight decay and etc.), keep the same model parameters (such as hidden size, model layers)
 and do it as follows: