# Source code for dgl.data.gindt

"""Datasets used in How Powerful Are Graph Neural Networks?
(chen jun)
Datasets include:
MUTAG, COLLAB, IMDBBINARY, IMDBMULTI, NCI1, PROTEINS, PTC, REDDITBINARY, REDDITMULTI5K
https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip
"""

import os

import numpy as np

from .. import backend as F
from ..convert import graph as dgl_graph
from ..utils import retry_method_with_fix
from .dgl_dataset import DGLBuiltinDataset
from .utils import (
    download,
    extract_archive,
    load_graphs,
    load_info,
    loadtxt,
    save_graphs,
    save_info,
)


class GINDataset(DGLBuiltinDataset):
    """Dataset Class for `How Powerful Are Graph Neural Networks? <https://arxiv.org/abs/1810.00826>`_.

    This is adapted from `<https://github.com/weihua916/powerful-gnns/blob/master/dataset.zip>`_.

    The class provides an interface for nine datasets used in the paper along with the paper-specific
    settings. The datasets are ``'MUTAG'``, ``'COLLAB'``, ``'IMDBBINARY'``, ``'IMDBMULTI'``,
    ``'NCI1'``, ``'PROTEINS'``, ``'PTC'``, ``'REDDITBINARY'``, ``'REDDITMULTI5K'``.

    If ``degree_as_nlabel`` is set to ``False``, then ``ndata['label']`` stores the provided node label,
    otherwise ``ndata['label']`` stores the node in-degrees.

    For graphs that have node attributes, ``ndata['attr']`` stores the node attributes.
    For graphs that have no attribute, ``ndata['attr']`` stores the corresponding one-hot encoding
    of ``ndata['label']``.

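    Parameters
    ----------
    name: str
        Dataset name, one of
        (``'MUTAG'``, ``'COLLAB'``, \
        ``'IMDBBINARY'``, ``'IMDBMULTI'``, \
        ``'NCI1'``, ``'PROTEINS'``, ``'PTC'``, \
        ``'REDDITBINARY'``, ``'REDDITMULTI5K'``)
    self_loop: bool
        If true, add a self-loop edge to every node.
    degree_as_nlabel: bool
        If true, use the node in-degree as the node label (and, for datasets
        without node attributes, as the source of the one-hot feature).
        Default: False
    raw_dir: str, optional
        Directory under which the archive is downloaded and extracted.
        Forwarded to the base class. Default: None
    force_reload: bool
        Forwarded to the base class. Default: False
    verbose: bool
        Whether to print progress information while processing.
        Default: False
    transform: callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.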

    Attributes
    ----------
    num_classes : int
        Number of classes for multiclass classification

    Examples
    --------
    >>> data = GINDataset(name='MUTAG', self_loop=False)

    The dataset instance is an iterable

    >>> len(data)
    188
    >>> g, label = data[128]
    >>> g
    Graph(num_nodes=13, num_edges=26,
          ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(7,), dtype=torch.float32)}
          edata_schemes={})
    >>> label
    tensor(1)

    Batch the graphs and labels for mini-batch training

    >>> graphs, labels = zip(*[data[i] for i in range(16)])
    >>> batched_graphs = dgl.batch(graphs)
    >>> batched_labels = torch.tensor(labels)
    >>> batched_graphs
    Graph(num_nodes=330, num_edges=748,
          ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64), 'attr': Scheme(shape=(7,), dtype=torch.float32)}
          edata_schemes={})
    """

    def __init__(
        self,
        name,
        self_loop,
        degree_as_nlabel=False,
        raw_dir=None,
        force_reload=False,
        verbose=False,
        transform=None,
    ):
        """Initialise empty bookkeeping state and call the base constructor.

        Every attribute below is assigned *before* the base-class
        constructor runs.
        NOTE(review): presumably the base constructor may invoke
        ``process``/``load``, which read and write these attributes --
        confirm against ``DGLBuiltinDataset``.
        """
        self._name = name  # MUTAG
        self.ds_name = "nig"
        archive_url = "https://raw.githubusercontent.com/weihua916/powerful-gnns/master/dataset.zip"

        # user-facing options
        self.self_loop = self_loop
        self.degree_as_nlabel = degree_as_nlabel

        # containers for the parsed graphs and their labels
        self.graphs = []
        self.labels = []

        # raw value -> contiguous index maps used for relabeling
        self.glabel_dict = {}
        self.nlabel_dict = {}
        self.elabel_dict = {}
        self.ndegree_dict = {}

        # dataset-wide counters: graphs (N), nodes (n), edges (m)
        self.N = 0
        self.n = 0
        self.m = 0

        # dataset-wide class counts and node feature dimension
        self.gclasses = 0
        self.nclasses = 0
        self.eclasses = 0
        self.dim_nfeats = 0

        # set while parsing: whether node attributes / labels were found
        self.nattrs_flag = False
        self.nlabels_flag = False

        super(GINDataset, self).__init__(
            name=name,
            url=archive_url,
            hash_key=(name, self_loop, degree_as_nlabel),
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def raw_path(self):
        return os.path.join(self.raw_dir, "GINDataset")

    def download(self):
        r"""Fetch the archive at ``self.url`` and unpack it into ``raw_path``.

        The archive is stored as ``GINDataset.zip`` inside ``raw_dir``.
        """
        archive = os.path.join(self.raw_dir, "GINDataset.zip")
        # ``download`` here is the module-level helper, not this method.
        download(self.url, path=archive)
        extract_archive(archive, self.raw_path)

[docs]    def __len__(self):
        """Return the number of graphs in the dataset."""
        return len(self.graphs)

[docs]    def __getitem__(self, idx):
        """Get the idx-th sample.

        Parameters
        ---------
        idx : int
            The sample index.

        Returns
        -------
        (:class:`dgl.Graph`, Tensor)
            The graph and its label.
        """
        if self._transform is None:
            g = self.graphs[idx]
        else:
            g = self._transform(self.graphs[idx])
        return g, self.labels[idx]

    def _file_path(self):
        return os.path.join(
            self.raw_dir,
            "GINDataset",
            "dataset",
            self.name,
            "{}.txt".format(self.name),
        )

    def process(self):
        """Loads input dataset from dataset/NAME/NAME.txt file

        File layout, as consumed below:

        * first line: ``N``, the number of graphs;
        * per graph, one header line ``n_nodes glabel``;
        * then ``n_nodes`` node rows of the form
          ``nlabel degree nbr_1 ... nbr_degree [attr_1 attr_2 ...]``,
          where the trailing float attributes are optional.

        Graph labels and node labels are remapped to contiguous indices in
        order of first appearance (``glabel_dict`` / ``nlabel_dict``).
        When no node attributes are present in the file, ``ndata['attr']``
        is generated as a one-hot encoding of ``ndata['label']`` (which is
        first replaced by the in-degree if ``degree_as_nlabel`` is set).

        Raises
        ------
        ValueError
            If a node row contains fewer tokens than its declared degree
            requires.
        """
        if self.verbose:
            print("loading data...")
        self.file = self._file_path()
        with open(self.file, "r") as f:
            # line_1 == N, total number of graphs
            self.N = int(f.readline().strip())

            for i in range(self.N):
                if (i + 1) % 10 == 0 and self.verbose is True:
                    print("processing graph {}...".format(i + 1))

                grow = f.readline().strip().split()
                # line_2 == [n_nodes, l] is equal to
                # [node number of a graph, class label of a graph]
                n_nodes, glabel = [int(w) for w in grow]

                # relabel graphs
                if glabel not in self.glabel_dict:
                    mapped = len(self.glabel_dict)
                    self.glabel_dict[glabel] = mapped

                self.labels.append(self.glabel_dict[glabel])

                g = dgl_graph(([], []))
                g.add_nodes(n_nodes)

                nlabels = []  # node labels
                nattrs = []  # node attributes if it has
                m_edges = 0

                for j in range(n_nodes):
                    nrow = f.readline().strip().split()

                    # handle edges and attributes(if has)
                    tmp = int(nrow[1]) + 2  # tmp == 2 + #edges
                    if tmp == len(nrow):
                        # no node attributes
                        nrow = [int(w) for w in nrow]
                    elif tmp < len(nrow):
                        # Extra tokens after the neighbor list are the node
                        # attributes. Slice them off BEFORE ``nrow`` is
                        # replaced by its integer prefix, otherwise the
                        # attribute tokens are lost.
                        nattr = [float(w) for w in nrow[tmp:]]
                        nattrs.append(nattr)
                        nrow = [int(w) for w in nrow[:tmp]]
                    else:
                        # Fewer tokens than ``label degree`` + neighbors:
                        # the declared degree does not match the row.
                        raise ValueError("edge number is incorrect!")

                    # relabel nodes if it has labels
                    # if it doesn't have node labels, then every nrow[0]==0
                    if nrow[0] not in self.nlabel_dict:
                        mapped = len(self.nlabel_dict)
                        self.nlabel_dict[nrow[0]] = mapped

                    nlabels.append(self.nlabel_dict[nrow[0]])

                    m_edges += nrow[1]
                    g.add_edges(j, nrow[2:])

                    # add self loop
                    if self.self_loop:
                        m_edges += 1
                        g.add_edges(j, j)

                    if (j + 1) % 10 == 0 and self.verbose is True:
                        print(
                            "processing node {} of graph {}...".format(
                                j + 1, i + 1
                            )
                        )
                        print("this node has {} edgs.".format(nrow[1]))

                if nattrs != []:
                    nattrs = np.stack(nattrs)
                    g.ndata["attr"] = F.tensor(nattrs, F.float32)
                    self.nattrs_flag = True

                g.ndata["label"] = F.tensor(nlabels)
                if len(self.nlabel_dict) > 1:
                    self.nlabels_flag = True

                assert g.number_of_nodes() == n_nodes

                # update statistics of graphs
                self.n += n_nodes
                self.m += m_edges

                self.graphs.append(g)

        self.labels = F.tensor(self.labels)
        # if no attr
        if not self.nattrs_flag:
            if self.verbose:
                print("there are no node features in this dataset!")
            # generate node attr by node degree
            if self.degree_as_nlabel:
                if self.verbose:
                    print("generate node features by node degree...")
                for g in self.graphs:
                    # actually this label shouldn't be updated
                    # in case users want to keep it
                    # but usually no features means no labels, fine.
                    g.ndata["label"] = g.in_degrees()
                    # extracting unique node labels

            # in case the labels/degrees are not continuous number
            # NOTE: the construction of this set is deliberately left as is;
            # its iteration order determines the one-hot column order in the
            # relabeling branch below.
            nlabel_set = set([])
            for g in self.graphs:
                nlabel_set = nlabel_set.union(
                    set([F.as_scalar(nl) for nl in g.ndata["label"]])
                )
            nlabel_set = list(nlabel_set)
            is_label_valid = all(
                [label in self.nlabel_dict for label in nlabel_set]
            )
            if (
                is_label_valid
                and len(nlabel_set) == np.max(nlabel_set) + 1
                and np.min(nlabel_set) == 0
            ):
                # Note this is different from the author's implementation. In weihua916's implementation,
                # the labels are relabeled anyway. But here we didn't relabel it if the labels are contiguous
                # to make it consistent with the original dataset
                label2idx = self.nlabel_dict
            else:
                label2idx = {nlabel_set[i]: i for i in range(len(nlabel_set))}
            # generate node attr by node label
            for g in self.graphs:
                attr = np.zeros((g.number_of_nodes(), len(label2idx)))
                attr[
                    range(g.number_of_nodes()),
                    [
                        label2idx[nl]
                        for nl in F.asnumpy(g.ndata["label"]).tolist()
                    ],
                ] = 1
                g.ndata["attr"] = F.tensor(attr, F.float32)

        # after load, get the #classes and #dim
        self.gclasses = len(self.glabel_dict)
        self.nclasses = len(self.nlabel_dict)
        self.eclasses = len(self.elabel_dict)
        self.dim_nfeats = len(self.graphs[0].ndata["attr"][0])

        if self.verbose:
            print("Done.")
            print(
                """
                -------- Data Statistics --------'
                #Graphs: %d
                #Graph Classes: %d
                #Nodes: %d
                #Node Classes: %d
                #Node Features Dim: %d
                #Edges: %d
                #Edge Classes: %d
                Avg. of #Nodes: %.2f
                Avg. of #Edges: %.2f
                Graph Relabeled: %s
                Node Relabeled: %s
                Degree Relabeled(If degree_as_nlabel=True): %s \n """
                % (
                    self.N,
                    self.gclasses,
                    self.n,
                    self.nclasses,
                    self.dim_nfeats,
                    self.m,
                    self.eclasses,
                    self.n / self.N,
                    self.m / self.N,
                    self.glabel_dict,
                    self.nlabel_dict,
                    self.ndegree_dict,
                )
            )

    def save(self):
        """Write the graphs (with labels) and the dataset metadata to ``save_path``.

        Graphs and the label tensor go to ``gin_<name>_<hash>.bin``; the
        counters, options and relabeling dictionaries go to
        ``gin_<name>_<hash>.pkl``.
        """
        cache_dir = self.save_path
        bin_file = os.path.join(
            cache_dir, "gin_{}_{}.bin".format(self.name, self.hash)
        )
        pkl_file = os.path.join(
            cache_dir, "gin_{}_{}.pkl".format(self.name, self.hash)
        )

        # Metadata, in the same key order as it is read back by ``load``'s
        # counterpart file.
        meta = {}
        meta["N"] = self.N
        meta["n"] = self.n
        meta["m"] = self.m
        meta["self_loop"] = self.self_loop
        meta["gclasses"] = self.gclasses
        meta["nclasses"] = self.nclasses
        meta["eclasses"] = self.eclasses
        meta["dim_nfeats"] = self.dim_nfeats
        meta["degree_as_nlabel"] = self.degree_as_nlabel
        meta["glabel_dict"] = self.glabel_dict
        meta["nlabel_dict"] = self.nlabel_dict
        meta["elabel_dict"] = self.elabel_dict
        meta["ndegree_dict"] = self.ndegree_dict

        save_graphs(str(bin_file), self.graphs, {"labels": self.labels})
        save_info(str(pkl_file), meta)

    def load(self):
        """Restore graphs, labels and metadata from the cache files in ``save_path``.

        Reads ``gin_<name>_<hash>.bin`` (graphs and label tensor) and
        ``gin_<name>_<hash>.pkl`` (metadata dictionary), then copies each
        metadata entry onto the attribute of the same name.
        """
        cache_dir = self.save_path
        bin_file = os.path.join(
            cache_dir, "gin_{}_{}.bin".format(self.name, self.hash)
        )
        pkl_file = os.path.join(
            cache_dir, "gin_{}_{}.pkl".format(self.name, self.hash)
        )
        loaded_graphs, tensors = load_graphs(str(bin_file))
        meta = load_info(str(pkl_file))

        self.graphs = loaded_graphs
        self.labels = tensors["labels"]

        # Each metadata key is stored under an attribute of the same name.
        restored_keys = (
            "N",
            "n",
            "m",
            "self_loop",
            "gclasses",
            "nclasses",
            "eclasses",
            "dim_nfeats",
            "glabel_dict",
            "nlabel_dict",
            "elabel_dict",
            "ndegree_dict",
            "degree_as_nlabel",
        )
        for key in restored_keys:
            setattr(self, key, meta[key])

    def has_cache(self):
        graph_path = os.path.join(
            self.save_path, "gin_{}_{}.bin".format(self.name, self.hash)
        )
        info_path = os.path.join(
            self.save_path, "gin_{}_{}.pkl".format(self.name, self.hash)
        )
        if os.path.exists(graph_path) and os.path.exists(info_path):
            return True
        return False

    @property
    def num_classes(self):
        return self.gclasses