Source code for dgl.data.heterophilous_graphs

"""
Datasets introduced in the `A Critical Look at the Evaluation of GNNs under Heterophily: Are We
Really Making Progress? <https://arxiv.org/abs/2302.11640>`__ paper.
"""
import os

import numpy as np

from ..convert import graph
from ..transforms.functional import to_bidirected
from .dgl_dataset import DGLBuiltinDataset
from .utils import download


class HeterophilousGraphDataset(DGLBuiltinDataset):
    r"""Datasets introduced in the 'A Critical Look at the Evaluation of GNNs under Heterophily:
    Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.

    Parameters
    ----------
    name : str
        Name of the dataset. One of 'roman-empire', 'amazon-ratings', 'minesweeper', 'tolokers',
        'questions'.
    raw_dir : str
        Raw file directory to store the processed data.
    force_reload : bool
        Whether to re-download the data source.
    verbose : bool
        Whether to print progress information.
    transform : callable
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
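
    Examples
    --------
    A minimal usage sketch, assuming the base class is instantiated directly with one of the
    supported dataset names (the concrete subclasses below expose the same interface):

    >>> from dgl.data.heterophilous_graphs import HeterophilousGraphDataset
    >>> dataset = HeterophilousGraphDataset(name="minesweeper")
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes
    >>> # node features, labels, and the first of the train/val/test splits
    >>> feat = g.ndata["feat"]
    >>> label = g.ndata["label"]
    >>> train_mask = g.ndata["train_mask"][:, 0]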
    """

    def __init__(
        self,
        name,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        name = name.lower().replace("-", "_")
        url = f"https://github.com/yandex-research/heterophilous-graphs/raw/main/data/{name}.npz"
        super(HeterophilousGraphDataset, self).__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        download(
            url=self.url, path=os.path.join(self.raw_path, f"{self.name}.npz")
        )

    def process(self):
        """Load and process the data."""
        try:
            import torch
        except ImportError:
            raise ModuleNotFoundError(
                "This dataset requires PyTorch to be the backend."
            )

        data = np.load(os.path.join(self.raw_path, f"{self.name}.npz"))
        src = torch.from_numpy(data["edges"][:, 0])
        dst = torch.from_numpy(data["edges"][:, 1])
        features = torch.from_numpy(data["node_features"])
        labels = torch.from_numpy(data["node_labels"])
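        # The masks are stored split-major in the .npz file; transposing gives
        # shape (num_nodes, num_splits), so column i selects the i-th split.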
        train_masks = torch.from_numpy(data["train_masks"].T)
        val_masks = torch.from_numpy(data["val_masks"].T)
        test_masks = torch.from_numpy(data["test_masks"].T)
        num_nodes = len(labels)
        num_classes = len(labels.unique())

        self._num_classes = num_classes

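        # to_bidirected makes every edge reciprocal and merges duplicates, so the
        # stored directed edge list becomes an undirected simple graph.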
        self._g = to_bidirected(graph((src, dst), num_nodes=num_nodes))
        self._g.ndata["feat"] = features
        self._g.ndata["label"] = labels
        self._g.ndata["train_mask"] = train_masks
        self._g.ndata["val_mask"] = val_masks
        self._g.ndata["test_mask"] = test_masks

    def has_cache(self):
        return os.path.exists(self.raw_path)

    def load(self):
        self.process()

    def __getitem__(self, idx):
        assert idx == 0, "This dataset has only one graph."
        if self._transform is None:
            return self._g
        else:
            return self._transform(self._g)

    def __len__(self):
        return 1

    @property
    def num_classes(self):
        return self._num_classes


class RomanEmpireDataset(HeterophilousGraphDataset):
    r"""Roman-empire dataset from the `A Critical Look at the Evaluation of GNNs under
    Heterophily: Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>`__ paper.

    This dataset is based on the Roman Empire article from English Wikipedia, which was selected
    since it is one of the longest articles on Wikipedia. Each node in the graph corresponds to
    one (non-unique) word in the text. Thus, the number of nodes in the graph is equal to the
    article's length. Two words are connected with an edge if at least one of the following two
    conditions holds: either these words follow each other in the text, or these words are
    connected in the dependency tree of the sentence (one word depends on the other). Thus, the
    graph is a chain graph with additional shortcut edges corresponding to syntactic dependencies
    between words. The class of a node is its syntactic role (17 most frequent roles were
    selected as unique classes and all the other roles were grouped into the 18th class). Node
    features are word embeddings.

    Statistics:

    - Nodes: 22662
    - Edges: 65854
    - Classes: 18
    - Node features: 300
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import RomanEmpireDataset
    >>> dataset = RomanEmpireDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        super(RomanEmpireDataset, self).__init__(
            name="roman-empire",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )


class AmazonRatingsDataset(HeterophilousGraphDataset):
    r"""Amazon-ratings dataset from the `A Critical Look at the Evaluation of GNNs under
    Heterophily: Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>`__ paper.

    This dataset is based on the Amazon product co-purchasing data. Nodes are products (books,
    music CDs, DVDs, VHS video tapes), and edges connect products that are frequently bought
    together. The task is to predict the average rating given to a product by reviewers. All
    possible rating values were grouped into five classes. Node features are the mean of word
    embeddings for words in the product description.

    Statistics:

    - Nodes: 24492
    - Edges: 186100
    - Classes: 5
    - Node features: 300
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import AmazonRatingsDataset
    >>> dataset = AmazonRatingsDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        super(AmazonRatingsDataset, self).__init__(
            name="amazon-ratings",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )


class MinesweeperDataset(HeterophilousGraphDataset):
    r"""Minesweeper dataset from the `A Critical Look at the Evaluation of GNNs under
    Heterophily: Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>`__ paper.

    This dataset is inspired by the Minesweeper game. The graph is a regular 100x100 grid where
    each node (cell) is connected to eight neighboring nodes (with the exception of nodes at the
    edge of the grid, which have fewer neighbors). 20% of the nodes are randomly selected as
    mines. The task is to predict which nodes are mines. The node features are one-hot-encoded
    numbers of neighboring mines. However, for randomly selected 50% of the nodes, the features
    are unknown, which is indicated by a separate binary feature.

    Statistics:

    - Nodes: 10000
    - Edges: 78804
    - Classes: 2
    - Node features: 7
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import MinesweeperDataset
    >>> dataset = MinesweeperDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        super(MinesweeperDataset, self).__init__(
            name="minesweeper",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )


class TolokersDataset(HeterophilousGraphDataset):
    r"""Tolokers dataset from the `A Critical Look at the Evaluation of GNNs under Heterophily:
    Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>`__ paper.

    This dataset is based on data from the Toloka crowdsourcing platform. The nodes represent
    tolokers (workers). An edge connects two tolokers if they have worked on the same task. The
    goal is to predict which tolokers have been banned in one of the projects. Node features are
    based on the worker's profile information and task performance statistics.

    Statistics:

    - Nodes: 11758
    - Edges: 1038000
    - Classes: 2
    - Node features: 10
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import TolokersDataset
    >>> dataset = TolokersDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        super(TolokersDataset, self).__init__(
            name="tolokers",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )


class QuestionsDataset(HeterophilousGraphDataset):
    r"""Questions dataset from the `A Critical Look at the Evaluation of GNNs under Heterophily:
    Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>`__ paper.

    This dataset is based on data from the question-answering website Yandex Q. Nodes are users,
    and an edge connects two nodes if one user answered the other user's question. The task is
    to predict which users remained active on the website (were not deleted or blocked). Node
    features are the mean of word embeddings for words in the user description. Users that do
    not have a description are indicated by a separate binary feature.

    Statistics:

    - Nodes: 48921
    - Edges: 307080
    - Classes: 2
    - Node features: 301
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import QuestionsDataset
    >>> dataset = QuestionsDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        super(QuestionsDataset, self).__init__(
            name="questions",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )