# Source code for the GNN benchmark node-classification datasets.

"""GNN Benchmark datasets for node classification."""
import os

import numpy as np
import scipy.sparse as sp

from .. import backend as F, transforms
from ..convert import graph as dgl_graph

from .dgl_dataset import DGLBuiltinDataset
from .utils import (
    _get_dgl_url,
    deprecate_class,
    load_graphs,
    save_graphs,
)

# Public API of this module: the documented dataset classes.
__all__ = [
    "AmazonCoBuyComputerDataset",
    "AmazonCoBuyPhotoDataset",
    "CoauthorPhysicsDataset",
    "CoauthorCSDataset",
    "CoraFullDataset",
]

def eliminate_self_loops(A):
    """Remove self-loops from the adjacency matrix.

    Parameters
    ----------
    A : scipy.sparse matrix
        Square adjacency matrix. It is not modified; the work is done on a
        converted copy.

    Returns
    -------
    scipy.sparse.csr_matrix
        Copy of ``A`` whose diagonal entries have been removed.
    """
    # LIL allows cheap structural edits such as overwriting the diagonal.
    A = A.tolil()
    A.setdiag(0)
    A = A.tocsr()
    # Drop the explicitly stored zeros left behind on the diagonal so they
    # are not counted as edges.
    A.eliminate_zeros()
    return A

class GNNBenchmarkDataset(DGLBuiltinDataset):
    r"""Base Class for GNN Benchmark dataset

    Every concrete dataset holds exactly one graph. The graph is built from
    a ``<name>.npz`` archive found in the raw directory, cached on disk as
    ``dgl_graph_v1.bin`` and returned by ``dataset[0]``.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Parameters
    ----------
    name : str
        Dataset name. It selects both the ``dataset/<name>.zip`` URL handed
        to the base class and the ``<name>.npz`` file read by
        :meth:`process`.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. It is applied on every access.
    """

    def __init__(
        self,
        name,
        raw_dir=None,
        force_reload=False,
        verbose=False,
        transform=None,
    ):
        _url = _get_dgl_url("dataset/" + name + ".zip")
        super(GNNBenchmarkDataset, self).__init__(
            name=name,
            url=_url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Build the graph from the raw ``<name>.npz`` archive."""
        npz_path = os.path.join(self.raw_path, self.name + ".npz")
        g = self._load_npz(npz_path)
        # NOTE(review): the permutation settings below follow the upstream
        # DGL implementation -- confirm if this file diverged from it.
        g = transforms.reorder_graph(
            g,
            node_permute_algo="rcmk",
            edge_permute_algo="dst",
            store_ids=False,
        )
        self._graph = g
        self._data = [g]
        self._print_info()

    def has_cache(self):
        """Return True if the processed graph is already saved on disk."""
        graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
        return os.path.exists(graph_path)

    def save(self):
        """Write the processed graph to ``dgl_graph_v1.bin``."""
        graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
        save_graphs(graph_path, self._graph)

    def load(self):
        """Restore the processed graph from ``dgl_graph_v1.bin``."""
        graph_path = os.path.join(self.save_path, "dgl_graph_v1.bin")
        graphs, _ = load_graphs(graph_path)
        self._graph = graphs[0]
        self._data = [graphs[0]]
        self._print_info()

    def _print_info(self):
        """Print basic graph statistics when ``verbose`` is enabled."""
        if self.verbose:
            print("  NumNodes: {}".format(self._graph.num_nodes()))
            print("  NumEdges: {}".format(self._graph.num_edges()))
            print("  NumFeats: {}".format(self._graph.ndata["feat"].shape[-1]))
            print("  NumbClasses: {}".format(self.num_classes))

    def _load_npz(self, file_name):
        """Read an ``.npz`` archive and turn it into a bidirected graph.

        Parameters
        ----------
        file_name : str
            Path of the ``.npz`` archive.

        Returns
        -------
        :class:`dgl.DGLGraph`
            Graph with ``ndata['feat']`` (float32) and ``ndata['label']``
            (int64) attached.

        Notes
        -----
        Assumes sparse arrays are stored as CSR triplets under the keys
        ``<prefix>_data`` / ``<prefix>_indices`` / ``<prefix>_indptr`` /
        ``<prefix>_shape`` (prefixes ``adj``, ``attr``, ``labels``) --
        TODO confirm against the archive layout.
        """
        # NOTE(security): allow_pickle=True can execute code embedded in the
        # archive; only load files obtained from a trusted source.
        with np.load(file_name, allow_pickle=True) as loader:
            loader = dict(loader)
            adj_matrix = sp.csr_matrix(
                (
                    loader["adj_data"],
                    loader["adj_indices"],
                    loader["adj_indptr"],
                ),
                shape=loader["adj_shape"],
            ).tocoo()

            if "attr_data" in loader:
                # Attributes are stored as a sparse CSR matrix
                attr_matrix = sp.csr_matrix(
                    (
                        loader["attr_data"],
                        loader["attr_indices"],
                        loader["attr_indptr"],
                    ),
                    shape=loader["attr_shape"],
                ).todense()
            elif "attr_matrix" in loader:
                # Attributes are stored as a (dense) np.ndarray
                attr_matrix = loader["attr_matrix"]
            else:
                attr_matrix = None

            if "labels_data" in loader:
                # Labels are stored as a CSR matrix
                labels = sp.csr_matrix(
                    (
                        loader["labels_data"],
                        loader["labels_indices"],
                        loader["labels_indptr"],
                    ),
                    shape=loader["labels_shape"],
                ).todense()
            elif "labels" in loader:
                # Labels are stored as a numpy array
                labels = loader["labels"]
            else:
                labels = None
        # COO exposes the edge list directly as (row, col) index arrays.
        g = dgl_graph((adj_matrix.row, adj_matrix.col))
        g = transforms.to_bidirected(g)
        g.ndata["feat"] = F.tensor(attr_matrix, F.data_type_dict["float32"])
        g.ndata["label"] = F.tensor(labels, F.data_type_dict["int64"])
        return g

    @property
    def num_classes(self):
        """Number of classes.

        Subclasses must override this property.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError

    def __getitem__(self, idx):
        r"""Get graph by index

        Parameters
        ----------
        idx : int
            Item index

        Returns
        -------
        :class:`dgl.DGLGraph`

            The graph contains:

            - ``ndata['feat']``: node features
            - ``ndata['label']``: node labels
        """
        assert idx == 0, "This dataset has only one graph"
        if self._transform is None:
            return self._graph
        return self._transform(self._graph)

    def __len__(self):
        r"""Number of graphs in the dataset"""
        return 1

class CoraFullDataset(GNNBenchmarkDataset):
    r"""CORA-Full dataset for node classification task.

    Extended Cora dataset. Nodes represent paper and edges represent
    citations.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 19,793
    - Edges: 126,842 (note that the original dataset has 65,311 edges but
      DGL adds the reverse edges and removes the duplicates, hence with a
      different number)
    - Number of Classes: 70
    - Node feature size: 8,710

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = CoraFullDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(CoraFullDataset, self).__init__(
            name="cora_full",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 70
class CoauthorCSDataset(GNNBenchmarkDataset):
    r"""'Computer Science (CS)' part of the Coauthor dataset for node
    classification task.

    Coauthor CS and Coauthor Physics are co-authorship graphs based on the
    Microsoft Academic Graph from the KDD Cup 2016 challenge. Here, nodes are
    authors, that are connected by an edge if they co-authored a paper; node
    features represent paper keywords for each author’s papers, and class
    labels indicate most active fields of study for each author.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 18,333
    - Edges: 163,788 (note that the original dataset has 81,894 edges but
      DGL adds the reverse edges and removes the duplicates, hence with a
      different number)
    - Number of classes: 15
    - Node feature size: 6,805

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = CoauthorCSDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(CoauthorCSDataset, self).__init__(
            name="coauthor_cs",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 15
class CoauthorPhysicsDataset(GNNBenchmarkDataset):
    r"""'Physics' part of the Coauthor dataset for node classification task.

    Coauthor CS and Coauthor Physics are co-authorship graphs based on the
    Microsoft Academic Graph from the KDD Cup 2016 challenge. Here, nodes are
    authors, that are connected by an edge if they co-authored a paper; node
    features represent paper keywords for each author’s papers, and class
    labels indicate most active fields of study for each author.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 34,493
    - Edges: 495,924 (note that the original dataset has 247,962 edges but
      DGL adds the reverse edges and removes the duplicates, hence with a
      different number)
    - Number of classes: 5
    - Node feature size: 8,415

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = CoauthorPhysicsDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(CoauthorPhysicsDataset, self).__init__(
            name="coauthor_physics",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 5
class AmazonCoBuyComputerDataset(GNNBenchmarkDataset):
    r"""'Computer' part of the AmazonCoBuy dataset for node classification
    task.

    Amazon Computers and Amazon Photo are segments of the Amazon co-purchase
    graph [McAuley et al., 2015], where nodes represent goods, edges indicate
    that two goods are frequently bought together, node features are
    bag-of-words encoded product reviews, and class labels are given by the
    product category.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 13,752
    - Edges: 491,722 (note that the original dataset has 245,778 edges but
      DGL adds the reverse edges and removes the duplicates, hence with a
      different number)
    - Number of classes: 10
    - Node feature size: 767

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = AmazonCoBuyComputerDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(AmazonCoBuyComputerDataset, self).__init__(
            name="amazon_co_buy_computer",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 10
class AmazonCoBuyPhotoDataset(GNNBenchmarkDataset):
    r"""AmazonCoBuy dataset for node classification task.

    Amazon Computers and Amazon Photo are segments of the Amazon co-purchase
    graph [McAuley et al., 2015], where nodes represent goods, edges indicate
    that two goods are frequently bought together, node features are
    bag-of-words encoded product reviews, and class labels are given by the
    product category.

    Reference: `<https://github.com/shchur/gnn-benchmark#datasets>`_

    Statistics:

    - Nodes: 7,650
    - Edges: 238,163 (note that the original dataset has 119,043 edges but
      DGL adds the reverse edges and removes the duplicates, hence with a
      different number)
    - Number of classes: 8
    - Node feature size: 745

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes for each node.

    Examples
    --------
    >>> data = AmazonCoBuyPhotoDataset()
    >>> g = data[0]
    >>> num_class = data.num_classes
    >>> feat = g.ndata['feat']  # get node feature
    >>> label = g.ndata['label']  # get node labels
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=False, transform=None
    ):
        super(AmazonCoBuyPhotoDataset, self).__init__(
            name="amazon_co_buy_photo",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def num_classes(self):
        """Number of classes.

        Return
        -------
        int
        """
        return 8
class CoraFull(CoraFullDataset):
    """Deprecated alias of :class:`CoraFullDataset`.

    Calls ``deprecate_class`` and then forwards every keyword argument to
    :class:`CoraFullDataset`.
    """

    def __init__(self, **kwargs):
        deprecate_class("CoraFull", "CoraFullDataset")
        super().__init__(**kwargs)


def AmazonCoBuy(name):
    """Deprecated factory for the AmazonCoBuy datasets.

    Parameters
    ----------
    name : str
        Either ``"computers"`` or ``"photo"``.

    Returns
    -------
    AmazonCoBuyComputerDataset or AmazonCoBuyPhotoDataset
        A dataset built with default arguments.

    Raises
    ------
    ValueError
        If ``name`` is neither of the two supported values.
    """
    if name == "computers":
        deprecate_class("AmazonCoBuy", "AmazonCoBuyComputerDataset")
        return AmazonCoBuyComputerDataset()
    if name == "photo":
        deprecate_class("AmazonCoBuy", "AmazonCoBuyPhotoDataset")
        return AmazonCoBuyPhotoDataset()
    raise ValueError('Dataset name should be "computers" or "photo".')


def Coauthor(name):
    """Deprecated factory for the Coauthor datasets.

    Parameters
    ----------
    name : str
        Either ``"cs"`` or ``"physics"``.

    Returns
    -------
    CoauthorCSDataset or CoauthorPhysicsDataset
        A dataset built with default arguments.

    Raises
    ------
    ValueError
        If ``name`` is neither of the two supported values.
    """
    if name == "cs":
        deprecate_class("Coauthor", "CoauthorCSDataset")
        return CoauthorCSDataset()
    if name == "physics":
        deprecate_class("Coauthor", "CoauthorPhysicsDataset")
        return CoauthorPhysicsDataset()
    raise ValueError('Dataset name should be "cs" or "physics".')