diff --git a/README.md b/README.md index 876b9473..f0d8a32f 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ We summarize the contributions of CogDL as follows: ## ❗ News +- The new **v0.5.1 release** adds fast operators including SpMM (cpu version) and scatter_max (cuda version). It also adds lots of datasets for node classification which can be found in [this link](./cogdl/datasets/rd2cd_data.py). 🎉 + - The new **v0.5.0 release** designs and implements a unified training loop for GNN. It introduces `DataWrapper` to help prepare the training/validation/test data and `ModelWrapper` to define the training/validation/test steps. 🎉 - The new **v0.4.1 release** adds the implementation of Deep GNNs and the recommendation task. It also supports new pipelines for generating embeddings and recommendation. Welcome to join our tutorial on KDD 2021 at 10:30 am - 12:00 am, Aug. 14th (Singapore Time). More details can be found in https://kdd2021graph.github.io/. 🎉 diff --git a/README_CN.md b/README_CN.md index 39b398ea..6a6aa14a 100644 --- a/README_CN.md +++ b/README_CN.md @@ -21,6 +21,8 @@ CogDL的特性包括: ## ❗ 最新 +- 最新的 **v0.5.1 release** 添加了一些高效的算子,包括cpu版本的SpMM和cuda版本的scatter_max。这个版本同时增加了很多用于节点分类的[数据集](./cogdl/datasets/rd2cd_data.py)。 🎉 + - 最新的 **v0.5.0 release** 为图神经网络的训练设计了一套统一的流程. 这个版本去除了原先的`Task`类,引入了`DataWrapper`来准备training/validation/test过程中所需的数据,引入了`ModelWrapper`来定义模型training/validation/test的步骤. 🎉 - 最新的 **v0.4.1 release** 增加了深层GNN的实现和推荐任务。这个版本同时提供了新的一些pipeline用于直接获取图表示和搭建推荐应用。欢迎大家参加我们在KDD 2021上的tutorial,时间是8月14号上午10:30 - 12:00(北京时间)。 更多的内容可以查看 https://kdd2021graph.github.io/. 🎉 diff --git a/cogdl/__init__.py b/cogdl/__init__.py index 1407ccee..bc830ca0 100644 --- a/cogdl/__init__.py +++ b/cogdl/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.0" +__version__ = "0.5.1" from .experiments import experiment from .oag import oagbert diff --git a/cogdl/configs.py b/cogdl/configs.py index 7b5a4a61..9033d3ef 100644 --- a/cogdl/configs.py +++ b/cogdl/configs.py @@ -1,505 +1,483 @@ BEST_CONFIGS = { - "node_classification": { - "chebyshev": {"general": {}}, - "dropedge_gcn": {"general": {}}, - "gat": { - "general": {"lr": 0.005, "epochs": 1000}, - "citeseer": {"weight_decay": 0.001}, - "pubmed": {"weight_decay": 0.001}, - "ppi-large": { - "nhead": 4, - "hidden_size": 256, - "lr": 0.005, - "residual": True, - "num_layers": 3, - "weight_decay": 0.0, - "attn_drop": 0, - "dropout": 0, - }, - }, - "gcn": { - "general": {}, - "ppi-large": { - "lr": 0.005, - "hidden_size": 2048, - "num_layers": 2, - "dropout": 0.3, - "epochs": 7000, - # 78.04 - }, - "flickr": { - "lr": 0.01, - "dropout": 0.5, - "weight_decay": 0.0, - "hidden_size": 256, - }, - }, - "sage": { - "general": {}, - }, - "gcnii": { - "general": { - "epochs": 1000, - "dropout": 0.5, - "wd1": 0.001, - "wd2": 5e-4, - }, - "cora": { - "num_layers": 64, - "hidden_size": 64, - "dropout": 0.6, - }, - "citeseer": { - "num_layers": 32, - "hidden_size": 256, - "lr": 0.001, - "patience": 200, - "epochs": 2000, - "lmbda": 0.6, - "dropout": 0.7, - }, - "pubmed": { - "num_layers": 16, - "hidden_size": 256, - "lmbda": 0.4, - "dropout": 0.5, - "wd1": 5e-4, - }, - "reddit": { - "num_layers": 3, - "hidden_size": 256, - "alpha": 0.2, - "wd1": 0, - "wd2": 0, - "lr": 0.001, - "residual": True, - "epochs": 1000, - "lmbda": 1.0, - # 96.42 - }, - "flickr": { - "lr": 0.002, - "num_layers": 4, - "residual": True, - "wd1": 0.0005, - "wd2": 0.0005, - "lmbda": 1.0, - "alpha": 0.1, - } - # 52.54 - }, - "gdc_gcn": { - "general": {"hidden_size": 16}, - }, - "grand": { - 
"general": { - "epochs": 1000, - }, - "cora": { - "order": 8, - "sample": 4, - "lam": 1.0, - "tem": 0.5, - "alpha": 0.5, - "patience": 200, - "input_dropout": 0.5, - "hidden_dropout": 0.5, - }, - "citeseer": { - "order": 2, - "sample": 2, - "lam": 0.7, - "tem": 0.3, - "alpha": 0.5, - "input_dropout": 0.0, - "hidden_dropout": 0.2, - "patience": 200, - }, - "pubmed": { - "order": 5, - "sample": 4, - "lam": 1.0, - "tem": 0.2, - "alpha": 0.5, - "lr": 0.2, - "bn": True, - "input_dropout": 0.6, - "hidden_dropout": 0.8, - }, - }, - "graphsage": { - "general": {}, - }, - "sgc": { - "general": { - "hidden_size": 16, - "dropout": 0.5, - }, - }, - "sgcpn": { - "general": { - "lr": 0.005, - "epochs": 1000, - "patience": 1000, - "norm_mode": "PN", - "norm_scale": 10, - "dropout": 0.6, - }, - }, - "sign": { - "general": { - "lr": 0.00005, - "hidden_size": 2048, - "dropout": 0.5, - "dropedge_rate": 0.2, - }, - }, - "srgcn": { - "general": { - "lr": 0.005, - "epochs": 1000, - }, - "cora": {"dropout": 0.6}, - "citeseer": {"dropout": 0.6}, - }, - "unet": { - "general": { - "epochs": 1000, - "n_dropout": 0.90, - "adj_dropout": 0.05, - "hidden_size": 128, - "aug_adj": False, - "improved": False, - "n_pool": 4, - "pool_rate": [0.7, 0.5, 0.5, 0.4], - }, - }, - "ppnp": { - "general": {}, - "flickr": { - "lr": 0.005, - "weight_decay": 0.001, - "dropout": 0.3, - "num_layers": 2, - "hidden_size": 256, - "epochs": 1000, - "num_iterations": 4, - "alpha": 0.5, - # 0.5227 - }, - "cora": { - "lr": 0.01, - "weight_decay": 0.000887, - "alpha": 0.1087, - "num_iterations": 7, - "num_layers": 2, - "epochs": 1000, - "hidden_size": 32, - "dropout": 0.8185, - # - }, - "citeseer": { - "num_iterations": 4, - "hidden_size": 128, - "lr": 0.01, - "dropout": 0.3095, - "alpha": 0.171, - "weight_decay": 0.0004117, - "num_layers": 2, - "epochs": 1000, - # 0.7199 - }, - "pubmed": { - "num_iterations": 10, - "hidden_size": 256, - "dropout": 0.3766, - "alpha": 0.1201, - "weight_decay": 0.0002754, - "epochs": 1000, - "lr": 0.001, - }, - "reddit": { - "lr": 0.005, - "hidden_size": 256, - "dropout": 0.3751, - "alpha": 0.1995, - "num_layers": 3, - "weight_decay": 5.918e-6, - "num_iterations": 3, - # 96.26 - }, - }, - "correct_smooth_mlp": { - "general": {}, - "ogbn_arxiv": { - "correct_norm": "row", - "smooth_norm": "col", - "correct_alpha": 0.9791632871592579, - "smooth_alpha": 0.7564990804200602, - "num_correct_prop": 50, - "num_smooth_prop": 50, - "autoscale": True, - "norm": "batchnorm", - }, - "ogbn_products": { - "correct_norm": "sym", - "smooth_norm": "row", - "correct_alpha": 1.0, - "smooth_alpha": 0.8, - "num_correct_prop": 50, - "num_smooth_prop": 50, - "autoscale": False, - "scale": 10.0, - "norm": "batchnorm", - "act_first": True, - }, - }, - "sagn": { - "general": { - "data_gpu": True, - "lr": 0.001, - "hidden-size": 512, - "attn-drop": 0.0, - "dropout": 0.7, - }, - "flickr": { - "threshold": 0.5, - "label-hop": 2, - "weight-decay": 3e-6, - "nstage": [50, 50, 50], - "nhop": 2, - "batch-size": 256, - }, - "reddit": { - "threshold": 0.9, - "lr": 0.0001, - "batch-size": 1000, - "nhop": 2, - "label-nhop": 4, - "weight-decay": 0.0, - "nstage": [500, 500, 500], - }, - }, - }, - "unsupervised_node_classification": { - "deepwalk": { - "general": {}, - }, - "dngr": { - "general": { - "hidden_size": 128, - "lr": 0.001, - "epochs": 500, - "hidden_size1": 1000, - "hidden_size2": 128, - "noise": 0.2, - "alpha": 0.1, - "step": 10, - }, - }, - "grarep": { - "general": {}, - }, - "hope": { - "general": {}, - }, - "line": { - "general": {}, - 
"blogcatalog": {"walk_num": 40}, - }, - "netmf": { - "general": {}, - "ppi-ne": {"window_size": 10, "is_large": True}, - "blogcatalog": {"window_size": 10, "is_large": True}, - "wikipedia": {"window_size": 1}, - }, - "netsmf": { - "general": {"window_size": 10, "num_round": 1000}, - "wikipedia": {"window_size": 1}, - "blogcatalog": {"num_round": 10000}, - }, - "node2vec": { - "general": {}, - }, - "prone": { - "general": {"step": 10}, - "ppi-ne": {"mu": 0.0}, - "wikipedia": {"mu": -4.0}, - "dblp-ne": {"mu": -1.2, "theta": 2.0}, - }, - "sdne": { - "general": {}, - }, - "spectral": { - "general": {}, - }, - "dgi": { - "general": {"weight_decay": 0}, - }, - "gcc": { - "general": {}, - }, - "grace": { - "general": { - "weight_decay": 0, - "epochs": 1000, - "patience": 20, - }, - "cora": { - "lr": 0.0005, - "weight_decay": 0.00001, - "tau": 0.4, - "drop_feature_rates": [0.3, 0.4], - "drop_edge_rates": [0.2, 0.4], - "epochs": 200, - "hidden_size": 128, - "proj_hidden_size": 128, - }, - "citeseer": { - "hidden_size": 256, - "proj_hidden_size": 256, - "drop_feature_rates": [0.3, 0.2], - "drop_edge_rates": [0.2, 0.0], - "lr": 0.001, - "_weight_decay": 0.00001, - "tau": 0.9, - "activation": "prelu", - }, - "pubmed": { - "hidden_size": 256, - "proj_hidden_size": 256, - "drop_edge_rates": [0.4, 0.1], - "drop_feature_rates": [0.0, 0.2], - "tau": 0.7, - "lr": 0.001, - "weight_decay": 0.00001, - }, - }, - "unsup_graphsage": { + "chebyshev": {"general": {}}, + "dropedge_gcn": {"general": {}}, + "gat": { + "general": {"lr": 0.005, "epochs": 1000}, + "citeseer": {"weight_decay": 0.001}, + "pubmed": {"weight_decay": 0.001}, + "ppi-large": { + "nhead": 4, + "hidden_size": 256, + "lr": 0.005, + "residual": True, + "num_layers": 3, + "weight_decay": 0.0, + "attn_drop": 0, + "dropout": 0, + }, + }, + "gcn": { + "general": {}, + "ppi-large": { + "lr": 0.005, + "hidden_size": 2048, + "num_layers": 2, + "dropout": 0.3, + "epochs": 7000, + # 78.04 + }, + "flickr": { + "lr": 0.01, + "dropout": 0.5, + "weight_decay": 0.0, + "hidden_size": 256, + }, + }, + "sage": { + "general": {}, + }, + "gcnii": { + "general": { + "epochs": 1000, + "dropout": 0.5, + "wd1": 0.001, + "wd2": 5e-4, + }, + "cora": { + "num_layers": 64, + "hidden_size": 64, + "dropout": 0.6, + }, + "citeseer": { + "num_layers": 32, + "hidden_size": 256, "lr": 0.001, - "weight_decay": 0, - "epochs": 3000, - }, - "revgcn": { - "general": {}, - "cora": { - "hidden_size": 128, - "lr": 0.001, - "dropout": 0.4706458854, - "weight_decay": 0.0008907, - "norm": "layernorm", - "num_layers": 10, - # 81.40 - }, - }, - }, - "graph_classification": { - "gin": { - "general": {"lr": 0.001}, - "imdb-b": {"degree_feature": True}, - "imdb-m": {"degree_feature": True}, - "collab": {"degree_feature": True}, - "proteins": { - "num_layers": 5, - "dropout": 0.0, - }, - "nci1": { - "num_layers": 5, - "dropout": 0.3, - "hidden_size": 64, - }, - }, - "infograph": { - "general": { - "lr": 0.0001, - "weight_decay": 5e-4, - "sup": False, - }, - "mutag": { - "num_layers": 1, - "epochs": 20, - }, - "imdb-b": {"degree_feature": True}, - "imdb-m": {"degree_feature": True}, - "collab": {"degree_feature": True}, - "nci1": {"num_layers": 3}, - }, - "sortpool": { - "nci1": { - "dropout": 0.3, - "hidden_size": 64, - "num_layers": 5, - }, - }, - "patchy_san": { - "general": { - "lr": 0.001, - "hidden_size": 32, - "gamma": 0.5, - "dropout": 0.5, - }, - "imdb-b": {"degree_feature": True}, - "imdb-m": {"degree_feature": True}, - "collab": {"degree_feature": True}, - }, - }, - 
"unsupervised_graph_classification": { - "graph2vec": { - "general": {}, - "nci1": { - "lr": 0.001, - "window_size": 8, - "epochs": 10, - "iteration": 4, - }, - "reddit-b": { - "lr": 0.01, - "degree_feature": True, - "hidden_size": 128, - }, + "patience": 200, + "epochs": 2000, + "lmbda": 0.6, + "dropout": 0.7, + }, + "pubmed": { + "num_layers": 16, + "hidden_size": 256, + "lmbda": 0.4, + "dropout": 0.5, + "wd1": 5e-4, + }, + "reddit": { + "num_layers": 3, + "hidden_size": 256, + "alpha": 0.2, + "wd1": 0, + "wd2": 0, + "lr": 0.001, + "residual": True, + "epochs": 1000, + "lmbda": 1.0, + # 96.42 + }, + "flickr": { + "lr": 0.002, + "num_layers": 4, + "residual": True, + "wd1": 0.0005, + "wd2": 0.0005, + "lmbda": 1.0, + "alpha": 0.1, } + # 52.54 }, - "link_prediction": {}, - "multiplex_link_prediction": { - "gatne": { - "general": {}, - "twitter": {"eval_type": "1"}, - } + "gdc_gcn": { + "general": {"hidden_size": 16}, + }, + "grand": { + "general": { + "epochs": 1000, + }, + "cora": { + "order": 8, + "sample": 4, + "lam": 1.0, + "tem": 0.5, + "alpha": 0.5, + "patience": 200, + "input_dropout": 0.5, + "hidden_dropout": 0.5, + }, + "citeseer": { + "order": 2, + "sample": 2, + "lam": 0.7, + "tem": 0.3, + "alpha": 0.5, + "input_dropout": 0.0, + "hidden_dropout": 0.2, + "patience": 200, + }, + "pubmed": { + "order": 5, + "sample": 4, + "lam": 1.0, + "tem": 0.2, + "alpha": 0.5, + "lr": 0.2, + "bn": True, + "input_dropout": 0.6, + "hidden_dropout": 0.8, + }, + }, + "graphsage": { + "general": {}, + }, + "sgc": { + "general": { + "hidden_size": 16, + "dropout": 0.5, + }, + }, + "sgcpn": { + "general": { + "lr": 0.005, + "epochs": 1000, + "patience": 1000, + "norm_mode": "PN", + "norm_scale": 10, + "dropout": 0.6, + }, + }, + "sign": { + "general": { + "lr": 0.00005, + "hidden_size": 2048, + "dropout": 0.5, + "dropedge_rate": 0.2, + }, }, - "multiplex_node_classification": { - "hin2vec": { - "general": { - "lr": 0.025, - }, + "srgcn": { + "general": { + "lr": 0.005, + "epochs": 1000, }, - "metapath2vec": { - "general": { - "walk_num": 40, - }, + "cora": {"dropout": 0.6}, + "citeseer": {"dropout": 0.6}, + }, + "unet": { + "general": { + "epochs": 1000, + "n_dropout": 0.90, + "adj_dropout": 0.05, + "hidden_size": 128, + "aug_adj": False, + "improved": False, + "n_pool": 4, + "pool_rate": [0.7, 0.5, 0.5, 0.4], }, - "pte": {}, }, - "heterogeneous_node_classification": { - "gtn": { - "general": { - "hidden_size": 128, - "lr": 0.005, - "weight_decay": 0.001, - }, + "ppnp": { + "general": {}, + "flickr": { + "lr": 0.005, + "weight_decay": 0.001, + "dropout": 0.3, + "num_layers": 2, + "hidden_size": 256, + "epochs": 1000, + "num_iterations": 4, + "alpha": 0.5, + # 0.5227 + }, + "cora": { + "lr": 0.01, + "weight_decay": 0.000887, + "alpha": 0.1087, + "num_iterations": 7, + "num_layers": 2, + "epochs": 1000, + "hidden_size": 32, + "dropout": 0.8185, + # + }, + "citeseer": { + "num_iterations": 4, + "hidden_size": 128, + "lr": 0.01, + "dropout": 0.3095, + "alpha": 0.171, + "weight_decay": 0.0004117, + "num_layers": 2, + "epochs": 1000, + # 0.7199 + }, + "pubmed": { + "num_iterations": 10, + "hidden_size": 256, + "dropout": 0.3766, + "alpha": 0.1201, + "weight_decay": 0.0002754, + "epochs": 1000, + "lr": 0.001, }, - "han": { - "general": { - "hidden_size": 128, - "lr": 0.005, - "weight_decay": 0.001, - } + "reddit": { + "lr": 0.005, + "hidden_size": 256, + "dropout": 0.3751, + "alpha": 0.1995, + "num_layers": 3, + "weight_decay": 5.918e-6, + "num_iterations": 3, + # 96.26 }, }, - "pretrain": {}, - 
"similarity_search": { - "gcc": { - "general": {}, + "correct_smooth_mlp": { + "general": {}, + "ogbn_arxiv": { + "correct_norm": "row", + "smooth_norm": "col", + "correct_alpha": 0.9791632871592579, + "smooth_alpha": 0.7564990804200602, + "num_correct_prop": 50, + "num_smooth_prop": 50, + "autoscale": True, + "norm": "batchnorm", + }, + "ogbn_products": { + "correct_norm": "sym", + "smooth_norm": "row", + "correct_alpha": 1.0, + "smooth_alpha": 0.8, + "num_correct_prop": 50, + "num_smooth_prop": 50, + "autoscale": False, + "scale": 10.0, + "norm": "batchnorm", + "act_first": True, + }, + }, + "sagn": { + "general": { + "data_gpu": True, + "lr": 0.001, + "hidden-size": 512, + "attn-drop": 0.0, + "dropout": 0.7, + }, + "flickr": { + "threshold": 0.5, + "label-hop": 2, + "weight-decay": 3e-6, + "nstage": [50, 50, 50], + "nhop": 2, + "batch-size": 256, + }, + "reddit": { + "threshold": 0.9, + "lr": 0.0001, + "batch-size": 1000, + "nhop": 2, + "label-nhop": 4, + "weight-decay": 0.0, + "nstage": [500, 500, 500], + }, + }, + "deepwalk": { + "general": {}, + }, + "dngr": { + "general": { + "hidden_size": 128, + "lr": 0.001, + "epochs": 500, + "hidden_size1": 1000, + "hidden_size2": 128, + "noise": 0.2, + "alpha": 0.1, + "step": 10, }, }, - "attributed_graph_clustering": {}, + "grarep": { + "general": {}, + }, + "hope": { + "general": {}, + }, + "line": { + "general": {}, + "blogcatalog": {"walk_num": 40}, + }, + "netmf": { + "general": {}, + "ppi-ne": {"window_size": 10, "is_large": True}, + "blogcatalog": {"window_size": 10, "is_large": True}, + "wikipedia": {"window_size": 1}, + }, + "netsmf": { + "general": {"window_size": 10, "num_round": 1000}, + "wikipedia": {"window_size": 1}, + "blogcatalog": {"num_round": 10000}, + }, + "node2vec": { + "general": {}, + }, + "prone": { + "general": {"step": 10}, + "ppi-ne": {"mu": 0.0}, + "wikipedia": {"mu": -4.0}, + "dblp-ne": {"mu": -1.2, "theta": 2.0}, + }, + "sdne": { + "general": {}, + }, + "spectral": { + "general": {}, + }, + "dgi": { + "general": {"weight_decay": 0}, + }, + "gcc": { + "general": {}, + }, + "grace": { + "general": { + "weight_decay": 0, + "epochs": 1000, + "patience": 20, + }, + "cora": { + "lr": 0.0005, + "weight_decay": 0.00001, + "tau": 0.4, + "drop_feature_rates": [0.3, 0.4], + "drop_edge_rates": [0.2, 0.4], + "epochs": 200, + "hidden_size": 128, + "proj_hidden_size": 128, + }, + "citeseer": { + "hidden_size": 256, + "proj_hidden_size": 256, + "drop_feature_rates": [0.3, 0.2], + "drop_edge_rates": [0.2, 0.0], + "lr": 0.001, + "_weight_decay": 0.00001, + "tau": 0.9, + "activation": "prelu", + }, + "pubmed": { + "hidden_size": 256, + "proj_hidden_size": 256, + "drop_edge_rates": [0.4, 0.1], + "drop_feature_rates": [0.0, 0.2], + "tau": 0.7, + "lr": 0.001, + "weight_decay": 0.00001, + }, + }, + "unsup_graphsage": { + "lr": 0.001, + "weight_decay": 0, + "epochs": 3000, + }, + "revgcn": { + "general": {}, + "cora": { + "hidden_size": 128, + "lr": 0.001, + "dropout": 0.4706458854, + "weight_decay": 0.0008907, + "norm": "layernorm", + "num_layers": 10, + # 81.40 + }, + }, + "gin": { + "general": {"lr": 0.001}, + "imdb-b": {"degree_feature": True}, + "imdb-m": {"degree_feature": True}, + "collab": {"degree_feature": True}, + "proteins": { + "num_layers": 5, + "dropout": 0.0, + }, + "nci1": { + "num_layers": 5, + "dropout": 0.3, + "hidden_size": 64, + }, + }, + "infograph": { + "general": { + "lr": 0.0001, + "weight_decay": 5e-4, + "sup": False, + }, + "mutag": { + "num_layers": 1, + "epochs": 20, + }, + "imdb-b": {"degree_feature": 
True}, + "imdb-m": {"degree_feature": True}, + "collab": {"degree_feature": True}, + "nci1": {"num_layers": 3}, + }, + "sortpool": { + "nci1": { + "dropout": 0.3, + "hidden_size": 64, + "num_layers": 5, + }, + }, + "patchy_san": { + "general": { + "lr": 0.001, + "hidden_size": 32, + "gamma": 0.5, + "dropout": 0.5, + }, + "imdb-b": {"degree_feature": True}, + "imdb-m": {"degree_feature": True}, + "collab": {"degree_feature": True}, + }, + "graph2vec": { + "general": {}, + "nci1": { + "lr": 0.001, + "window_size": 8, + "epochs": 10, + "iteration": 4, + }, + "reddit-b": { + "lr": 0.01, + "degree_feature": True, + "hidden_size": 128, + }, + }, + "gatne": { + "general": {}, + "twitter": {"eval_type": "1"}, + }, + "hin2vec": { + "general": { + "lr": 0.025, + }, + }, + "metapath2vec": { + "general": { + "walk_num": 40, + }, + }, + "pte": {}, + "gtn": { + "general": { + "hidden_size": 128, + "lr": 0.005, + "weight_decay": 0.001, + }, + }, + "han": { + "general": { + "hidden_size": 128, + "lr": 0.005, + "weight_decay": 0.001, + } + }, } diff --git a/cogdl/datasets/__init__.py b/cogdl/datasets/__init__.py index d888021a..f553ce8a 100644 --- a/cogdl/datasets/__init__.py +++ b/cogdl/datasets/__init__.py @@ -158,4 +158,20 @@ def build_dataset_from_path(data_path, dataset=None): "yelp2018": "cogdl.datasets.rec_data.Yelp2018Dataset", "ali": "cogdl.datasets.rec_data.AliDataset", "amazon-rec": "cogdl.datasets.rec_data.AmazonRecDataset", + "Github": "cogdl.datasets.rd2cd_data.Github", + "Elliptic": "cogdl.datasets.rd2cd_data.Elliptic", + "Film": "cogdl.datasets.rd2cd_data.Film", + "Wiki": "cogdl.datasets.rd2cd_data.Wiki", + "Clothing": "cogdl.datasets.rd2cd_data.Clothing", + "Electronics": "cogdl.datasets.rd2cd_data.Electronics", + "Dblp": "cogdl.datasets.rd2cd_data.Dblp", + "Yelpchi": "cogdl.datasets.rd2cd_data.Yelpchi", + "Alpha": "cogdl.datasets.rd2cd_data.Alpha", + "Weibo": "cogdl.datasets.rd2cd_data.Weibo", + "bgp": "cogdl.datasets.rd2cd_data.bgp", + "ssn5": "cogdl.datasets.rd2cd_data.ssn5", + "ssn7": "cogdl.datasets.rd2cd_data.ssn7", + "Aids": "cogdl.datasets.rd2cd_data.Aids", + "Nba": "cogdl.datasets.rd2cd_data.Nba", + "Pokec_z": "cogdl.datasets.rd2cd_data.Pokec_z", } diff --git a/cogdl/datasets/rd2cd_data.py b/cogdl/datasets/rd2cd_data.py new file mode 100644 index 00000000..2147bbb5 --- /dev/null +++ b/cogdl/datasets/rd2cd_data.py @@ -0,0 +1,186 @@ +import os +import random +import tarfile + +import numpy as np +import requests +import torch +from torch import Tensor + +from cogdl.data import Graph +from cogdl.datasets import NodeDataset + +base_url = "https://cloud.tsinghua.edu.cn/d/65d7c53dd8474d7091a9/files/?p=%2F" + + +def get_whole_mask(y, ratio: str, seed: int = 1234567): + """split the whole dataset in proportion""" + y_have_label_mask = y != -1 + total_node_num = len(y) + y_index_tensor = torch.tensor(list(range(total_node_num)), dtype=int) + masked_index = y_index_tensor[y_have_label_mask] + while True: + (train_mask, val_mask, test_mask) = get_order(ratio, masked_index, total_node_num, seed) + if check_train_containing(train_mask, y): + return (train_mask, val_mask, test_mask) + else: + seed += 1 + + +def get_order(ratio: str, masked_index: Tensor, total_node_num: int, seed: int = 1234567): + """ + return:(train_mask,val_mask,test_mask) + """ + random.seed(seed) + + masked_node_num = len(masked_index) + shuffle_criterion = list(range(masked_node_num)) + random.shuffle(shuffle_criterion) + + train_val_test_list = [int(i) for i in ratio.split("-")] + tvt_sum = sum(train_val_test_list) + 
tvt_ratio_list = [i / tvt_sum for i in train_val_test_list] + + train_end_index = int(tvt_ratio_list[0] * masked_node_num) + val_end_index = train_end_index + int(tvt_ratio_list[1] * masked_node_num) + + train_mask_index = shuffle_criterion[:train_end_index] + val_mask_index = shuffle_criterion[train_end_index:val_end_index] + test_mask_index = shuffle_criterion[val_end_index:] + + train_mask = torch.zeros(total_node_num, dtype=torch.bool) + train_mask[masked_index[train_mask_index]] = True + val_mask = torch.zeros(total_node_num, dtype=torch.bool) + val_mask[masked_index[val_mask_index]] = True + test_mask = torch.zeros(total_node_num, dtype=torch.bool) + test_mask[masked_index[test_mask_index]] = True + + return (train_mask, val_mask, test_mask) + + +def check_train_containing(train_mask, y): + for label in y.unique(): + if label.item() == -1: + continue + if label.item() not in y[train_mask]: + return False + return True + + +class RD2CD(NodeDataset): + def __init__(self, root, name): + self.root = root + self.name = name + self.source_path = root + "/" + name + "/raw" + if not os.path.exists(self.source_path): + os.makedirs(self.source_path) + dst_path = root + "/" + name + "/processed" + if not os.path.exists(dst_path): + os.makedirs(dst_path) + self.data_path = dst_path + "/data.pt" + if not os.path.exists(self.data_path): + self.download() + super(RD2CD, self).__init__(path=self.data_path, scale_feat=False) + + def download(self): + r = requests.get(base_url + self.name + ".tgz&dl=1") + tarfile_path = self.source_path + "/" + self.name + ".tgz" + with open(tarfile_path, "wb") as f: + f.write(r.content) + with tarfile.open(tarfile_path, "r") as f: + f.extractall(self.source_path) + + def process(self): + numpy_x = np.load(self.source_path + "/x.npy") + x = torch.from_numpy(numpy_x).to(torch.float) + numpy_y = np.load(self.source_path + "/y.npy") + y = torch.from_numpy(numpy_y).to(torch.long) + numpy_edge_index = np.load(self.source_path + "/edge_index.npy") + edge_index = torch.from_numpy(numpy_edge_index).to(torch.long) + + # set train/val/test mask in node_classification task + random_seed = 14530529 # a fixed seed + (train_mask, val_mask, test_mask) = get_whole_mask(y, "6-2-2", random_seed) + data = Graph(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask) + torch.save(data, self.data_path) + return data + + +class Github(RD2CD): + def __init__(self, root="data"): + super(Github, self).__init__(root, "Github") + + +class Elliptic(RD2CD): + def __init__(self, root="data"): + super(Elliptic, self).__init__(root, "Elliptic") + + +class Film(RD2CD): + def __init__(self, root="data"): + super(Film, self).__init__(root, "Film") + + +class Wiki(RD2CD): + def __init__(self, root="data"): + super(Wiki, self).__init__(root, "Wiki") + + +class Clothing(RD2CD): + def __init__(self, root="data"): + super(Clothing, self).__init__(root, "Clothing") + + +class Electronics(RD2CD): + def __init__(self, root="data"): + super(Electronics, self).__init__(root, "Electronics") + + +class Dblp(RD2CD): + def __init__(self, root="data"): + super(Dblp, self).__init__(root, "Dblp") + + +class Yelpchi(RD2CD): + def __init__(self, root="data"): + super(Yelpchi, self).__init__(root, "Yelpchi") + + +class Alpha(RD2CD): + def __init__(self, root="data"): + super(Alpha, self).__init__(root, "Alpha") + + +class Weibo(RD2CD): + def __init__(self, root="data"): + super(Weibo, self).__init__(root, "Weibo") + + +class bgp(RD2CD): + def __init__(self, root="data"): + 
super(bgp, self).__init__(root, "bgp") + + +class ssn5(RD2CD): + def __init__(self, root="data"): + super(ssn5, self).__init__(root, "ssn5") + + +class ssn7(RD2CD): + def __init__(self, root="data"): + super(ssn7, self).__init__(root, "ssn7") + + +class Aids(RD2CD): + def __init__(self, root="data"): + super(Aids, self).__init__(root, "Aids") + + +class Nba(RD2CD): + def __init__(self, root="data"): + super(Nba, self).__init__(root, "Nba") + + +class Pokec_z(RD2CD): + def __init__(self, root="data"): + super(Pokec_z, self).__init__(root, "Pokec_z") diff --git a/cogdl/experiments.py b/cogdl/experiments.py index 8f5a9c5f..055c4e3f 100644 --- a/cogdl/experiments.py +++ b/cogdl/experiments.py @@ -75,10 +75,9 @@ def run(self): def set_best_config(args): - configs = BEST_CONFIGS[args.task] - if args.model not in configs: + if args.model not in BEST_CONFIGS: return args - configs = configs[args.model] + configs = BEST_CONFIGS[args.model] for key, value in configs["general"].items(): args.__setattr__(key, value) if args.dataset not in configs: diff --git a/cogdl/models/__init__.py b/cogdl/models/__init__.py index 5221a112..d33177b5 100644 --- a/cogdl/models/__init__.py +++ b/cogdl/models/__init__.py @@ -70,6 +70,7 @@ def build_model(args): "mvgrl": "cogdl.models.nn.mvgrl.MVGRL", "patchy_san": "cogdl.models.nn.patchy_san.PatchySAN", "gcn": "cogdl.models.nn.gcn.GCN", + "actgcn": "cogdl.models.nn.actgcn.ActGCN", "gdc_gcn": "cogdl.models.nn.gdc_gcn.GDC_GCN", "graphsage": "cogdl.models.nn.graphsage.Graphsage", "compgcn": "cogdl.models.nn.compgcn.LinkPredictCompGCN", diff --git a/cogdl/models/nn/actgcn.py b/cogdl/models/nn/actgcn.py new file mode 100644 index 00000000..67bd57e9 --- /dev/null +++ b/cogdl/models/nn/actgcn.py @@ -0,0 +1,93 @@ +import torch.nn as nn + +try: + from cogdl.layers.actgcn_layer import ActGCNLayer +except Exception: + print("Please install the actnn library first.") + exit(1) + +from .. import BaseModel + + +class ActGCN(BaseModel): + r"""The GCN model from the `"Semi-Supervised Classification with Graph Convolutional Networks" + `_ paper + + Args: + in_features (int) : Number of input features. + out_features (int) : Number of classes. + hidden_size (int) : The dimension of node representation. + dropout (float) : Dropout rate for model training. 
+ """ + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # fmt: off + parser.add_argument("--num-features", type=int) + parser.add_argument("--num-classes", type=int) + parser.add_argument("--num-layers", type=int, default=2) + parser.add_argument("--hidden-size", type=int, default=64) + parser.add_argument("--dropout", type=float, default=0.5) + parser.add_argument("--residual", action="store_true") + parser.add_argument("--norm", type=str, default=None) + parser.add_argument("--activation", type=str, default="relu") + # fmt: on + + @classmethod + def build_model_from_args(cls, args): + return cls( + args.num_features, + args.hidden_size, + args.num_classes, + args.num_layers, + args.dropout, + args.activation, + args.residual, + args.norm, + args.rp_ratio, + ) + + def __init__( + self, + in_feats, + hidden_size, + out_feats, + num_layers, + dropout, + activation="relu", + residual=False, + norm=None, + rp_ratio=1, + ): + super(ActGCN, self).__init__() + shapes = [in_feats] + [hidden_size] * (num_layers - 1) + [out_feats] + self.layers = nn.ModuleList( + [ + ActGCNLayer( + shapes[i], + shapes[i + 1], + dropout=dropout if i != num_layers - 1 else 0, + residual=residual if i != num_layers - 1 else None, + norm=norm if i != num_layers - 1 else None, + activation=activation if i != num_layers - 1 else None, + rp_ratio=rp_ratio, + ) + for i in range(num_layers) + ] + ) + self.num_layers = num_layers + + def embed(self, graph): + graph.sym_norm() + h = graph.x + for i in range(self.num_layers - 1): + h = self.layers[i](graph, h) + return h + + def forward(self, graph): + graph.sym_norm() + h = graph.x + for i in range(self.num_layers): + h = self.layers[i](graph, h) + return h diff --git a/cogdl/models/nn/gcn.py b/cogdl/models/nn/gcn.py index 70696df3..08a024b2 100644 --- a/cogdl/models/nn/gcn.py +++ b/cogdl/models/nn/gcn.py @@ -40,8 +40,6 @@ def build_model_from_args(cls, args): args.activation, args.residual, args.norm, - args.actnn, - args.rp_ratio, ) def __init__( @@ -54,29 +52,18 @@ def __init__( activation="relu", residual=False, norm=None, - actnn=False, - rp_ratio=1, ): super(GCN, self).__init__() shapes = [in_feats] + [hidden_size] * (num_layers - 1) + [out_feats] - Layer = GCNLayer - if actnn: - try: - from cogdl.layers.actgcn_layer import ActGCNLayer - except Exception: - print("Please install the actnn library first.") - exit(1) - Layer = ActGCNLayer self.layers = nn.ModuleList( [ - Layer( + GCNLayer( shapes[i], shapes[i + 1], dropout=dropout if i != num_layers - 1 else 0, residual=residual if i != num_layers - 1 else None, norm=norm if i != num_layers - 1 else None, activation=activation if i != num_layers - 1 else None, - rp_ratio=rp_ratio, ) for i in range(num_layers) ] @@ -96,6 +83,3 @@ def forward(self, graph): for i in range(self.num_layers): h = self.layers[i](graph, h) return h - - def predict(self, data): - return self.forward(data) diff --git a/docs/source/index.rst b/docs/source/index.rst index 2481e698..482c3d60 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -17,6 +17,7 @@ We summarize the contributions of CogDL as follows: ❗ News ------------ +- The new **v0.5.1 release** adds fast operators including SpMM (cpu version) and scatter_max (cuda version). It also adds lots of datasets for node classification. 🎉 - The new **v0.5.0 release** designs and implements a unified training loop for GNN. 
It introduces `DataWrapper` to help prepare the training/validation/test data and `ModelWrapper` to define the training/validation/test steps.
- The new **v0.4.1 release** adds the implementation of Deep GNNs and the recommendation task. It also supports new pipelines for generating embeddings and recommendation. Welcome to join our tutorial on KDD 2021 at 10:30 am - 12:00 am, Aug. 14th (Singapore Time). More details can be found in https://kdd2021graph.github.io/. 🎉
- The new **v0.4.0 release** refactors the data storage (from ``Data`` to ``Graph``) and provides more fast operators to speed up GNN training. It also includes many self-supervised learning methods on graphs. BTW, we are glad to announce that we will give a tutorial on KDD 2021 in August. Please see this `link <https://kdd2021graph.github.io/>`_ for more details. 🎉
diff --git a/tests/datasets/test_rd2cd_data.py b/tests/datasets/test_rd2cd_data.py
new file mode 100644
index 00000000..89d73b7f
--- /dev/null
+++ b/tests/datasets/test_rd2cd_data.py
@@ -0,0 +1,55 @@
+from cogdl.datasets import build_dataset
+from cogdl.utils import build_args_from_dict
+
+
+def test_rd2cd_github():
+    args = build_args_from_dict({"dataset": "Github"})
+    assert args.dataset == "Github"
+    dataset = build_dataset(args)
+    data = dataset.data
+    assert data.num_nodes == 37700
+    assert dataset.num_features == 4005
+
+
+def test_rd2cd_elliptic():
+    args = build_args_from_dict({"dataset": "Elliptic"})
+    assert args.dataset == "Elliptic"
+    dataset = build_dataset(args)
+    data = dataset.data
+    assert data.num_nodes == 203769
+    assert dataset.num_features == 164
+
+
+def test_rd2cd_clothing():
+    args = build_args_from_dict({"dataset": "Clothing"})
+    assert args.dataset == "Clothing"
+    dataset = build_dataset(args)
+    data = dataset.data
+    assert data.num_nodes == 24919
+    assert dataset.num_features == 9034
+
+
+def test_rd2cd_electronics():
+    args = build_args_from_dict({"dataset": "Electronics"})
+    assert args.dataset == "Electronics"
+    dataset = build_dataset(args)
+    data = dataset.data
+    assert data.num_nodes == 42318
+    assert dataset.num_features == 8669
+
+
+def test_rd2cd_dblp():
+    args = build_args_from_dict({"dataset": "Dblp"})
+    assert args.dataset == "Dblp"
+    dataset = build_dataset(args)
+    data = dataset.data
+    assert data.num_nodes == 40672
+    assert dataset.num_features == 7202
+
+
+if __name__ == "__main__":
+    test_rd2cd_github()
+    test_rd2cd_elliptic()
+    test_rd2cd_clothing()
+    test_rd2cd_electronics()
+    test_rd2cd_dblp()
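
A quick orientation on how the pieces in this diff fit together: the rd2cd datasets are implemented in cogdl/datasets/rd2cd_data.py and registered by name in cogdl/datasets/__init__.py, BEST_CONFIGS in cogdl/configs.py is now keyed directly by model name to match the updated set_best_config, and the actnn-backed GCN is registered as "actgcn" in cogdl/models/__init__.py. The snippet below is a minimal usage sketch rather than part of the diff; it assumes this branch is installed, that the dataset mirror at the base_url above is reachable, and (for the commented-out line) that the optional actnn dependency is available.

    from cogdl import experiment
    from cogdl.datasets.rd2cd_data import Github  # one of the newly added rd2cd datasets

    # Instantiate a new dataset directly; the first run downloads Github.tgz from the
    # Tsinghua cloud mirror and caches the processed graph at data/Github/processed/data.pt.
    dataset = Github(root="data")
    print(dataset.data.num_nodes, dataset.num_features)  # 37700 nodes, 4005 features per the new tests

    # Or run a node-classification experiment through the registered dataset name.
    # set_best_config now looks up BEST_CONFIGS by model key alone, with no task-level nesting.
    experiment(dataset="Github", model="gcn")

    # The activation-compressed variant added in this diff is registered as "actgcn"
    # and requires actnn to be installed:
    # experiment(dataset="Github", model="actgcn")

Loading a dataset by its registered name goes through the same build_dataset path that the new tests in tests/datasets/test_rd2cd_data.py exercise.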