Source code for dgl.sampling.neighbor

"""Neighbor sampling APIs"""

from .._ffi.function import _init_api
from .. import backend as F
from ..base import DGLError, EID
from ..heterograph import DGLHeteroGraph
from .. import ndarray as nd
from .. import utils

__all__ = [
    'sample_neighbors',
    'select_topk']

[docs]def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False, copy_ndata=True, copy_edata=True, _dist_training=False): """Sample neighboring edges of the given nodes and return the induced subgraph. For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges will be randomly chosen. The graph returned will then contain all the nodes in the original graph, but only the sampled edges. Node/edge features are not preserved. The original IDs of the sampled edges are stored as the `dgl.EID` feature in the returned graph. Parameters ---------- g : DGLGraph The graph. Must be on CPU. nodes : tensor or dict Node IDs to sample neighbors from. This argument can take a single ID tensor or a dictionary of node types and ID tensors. If a single tensor is given, the graph must only have one type of nodes. fanout : int or dict[etype, int] The number of edges to be sampled for each node on each edge type. This argument can take a single int or a dictionary of edge types and ints. If a single int is given, DGL will sample this number of edges for each node for every edge type. If -1 is given for a single edge type, all the neighboring edges with that edge type will be selected. edge_dir : str, optional Determines whether to sample inbound or outbound edges. Can take either ``in`` for inbound edges or ``out`` for outbound edges. prob : str, optional Feature name used as the (unnormalized) probabilities associated with each neighboring edge of a node. The feature must have only one element for each edge. The features must be non-negative floats, and the sum of the features of inbound/outbound edges for every node must be positive (though they don't have to sum up to one). Otherwise, the result will be undefined. replace : bool, optional If True, sample with replacement. copy_ndata: bool, optional If True, the node features of the new graph are copied from the original graph. If False, the new graph will not have any node features. (Default: True) copy_edata: bool, optional If True, the edge features of the new graph are copied from the original graph. If False, the new graph will not have any edge features. (Default: True) _dist_training : bool, optional Internal argument. Do not use. (Default: False) Returns ------- DGLGraph A sampled subgraph containing only the sampled neighboring edges. It is on CPU. Notes ----- If :attr:`copy_ndata` or :attr:`copy_edata` is True, same tensors are used as the node or edge features of the original graph and the new graph. As a result, users should avoid performing in-place operations on the node features of the new graph to avoid feature corruption. Examples -------- Assume that you have the following graph >>> g = dgl.graph(([0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0])) And the weights >>> g.edata['prob'] = torch.FloatTensor([0., 1., 0., 1., 0., 1.]) To sample one inbound edge for node 0 and node 1: >>> sg = dgl.sampling.sample_neighbors(g, [0, 1], 1) >>> sg.edges(order='eid') (tensor([1, 0]), tensor([0, 1])) >>> sg.edata[dgl.EID] tensor([2, 0]) To sample one inbound edge for node 0 and node 1 with probability in edge feature ``prob``: >>> sg = dgl.sampling.sample_neighbors(g, [0, 1], 1, prob='prob') >>> sg.edges(order='eid') (tensor([2, 1]), tensor([0, 1])) With ``fanout`` greater than the number of actual neighbors and without replacement, DGL will take all neighbors instead: >>> sg = dgl.sampling.sample_neighbors(g, [0, 1], 3) >>> sg.edges(order='eid') (tensor([1, 2, 0, 1]), tensor([0, 0, 1, 1])) """ if not isinstance(nodes, dict): if len(g.ntypes) > 1: raise DGLError("Must specify node type when the graph is not homogeneous.") nodes = {g.ntypes[0] : nodes} assert g.device == F.cpu(), "Graph must be on CPU." nodes = utils.prepare_tensor_dict(g, nodes, 'nodes') nodes_all_types = [] for ntype in g.ntypes: if ntype in nodes: nodes_all_types.append(F.to_dgl_nd(nodes[ntype])) else: nodes_all_types.append(nd.array([], ctx=nd.cpu())) if not isinstance(fanout, dict): fanout_array = [int(fanout)] * len(g.etypes) else: if len(fanout) != len(g.etypes): raise DGLError('Fan-out must be specified for each edge type ' 'if a dict is provided.') fanout_array = [None] * len(g.etypes) for etype, value in fanout.items(): fanout_array[g.get_etype_id(etype)] = value fanout_array = F.to_dgl_nd(F.tensor(fanout_array, dtype=F.int64)) if prob is None: prob_arrays = [nd.array([], ctx=nd.cpu())] * len(g.etypes) else: prob_arrays = [] for etype in g.canonical_etypes: if prob in g.edges[etype].data: prob_arrays.append(F.to_dgl_nd(g.edges[etype].data[prob])) else: prob_arrays.append(nd.array([], ctx=nd.cpu())) subgidx = _CAPI_DGLSampleNeighbors(g._graph, nodes_all_types, fanout_array, edge_dir, prob_arrays, replace) induced_edges = subgidx.induced_edges ret = DGLHeteroGraph(subgidx.graph, g.ntypes, g.etypes) # handle features # (TODO) (BarclayII) DGL distributed fails with bus error, freezes, or other # incomprehensible errors with lazy feature copy. # So in distributed training context, we fall back to old behavior where we # only set the edge IDs. if not _dist_training: if copy_ndata: node_frames = utils.extract_node_subframes(g, None) utils.set_new_frames(ret, node_frames=node_frames) if copy_edata: edge_frames = utils.extract_edge_subframes(g, induced_edges) utils.set_new_frames(ret, edge_frames=edge_frames) else: for i, etype in enumerate(ret.canonical_etypes): ret.edges[etype].data[EID] = induced_edges[i] return ret
[docs]def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False, copy_ndata=True, copy_edata=True): """Select the neighboring edges with k-largest (or k-smallest) weights of the given nodes and return the induced subgraph. For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges with the largest (or smallest when ``ascending == True``) weights will be chosen. The graph returned will then contain all the nodes in the original graph, but only the sampled edges. Node/edge features are not preserved. The original IDs of the sampled edges are stored as the `dgl.EID` feature in the returned graph. Parameters ---------- g : DGLGraph The graph. Must be on CPU. k : int or dict[etype, int] The number of edges to be selected for each node on each edge type. This argument can take a single int or a dictionary of edge types and ints. If a single int is given, DGL will select this number of edges for each node for every edge type. If -1 is given for a single edge type, all the neighboring edges with that edge type will be selected. weight : str Feature name of the weights associated with each edge. The feature should have only one element for each edge. The feature can be either int32/64 or float32/64. nodes : tensor or dict, optional Node IDs to sample neighbors from. This argument can take a single ID tensor or a dictionary of node types and ID tensors. If a single tensor is given, the graph must only have one type of nodes. If None, DGL will select the edges for all nodes. edge_dir : str, optional Determines whether to sample inbound or outbound edges. Can take either ``in`` for inbound edges or ``out`` for outbound edges. ascending : bool, optional If True, DGL will return edges with k-smallest weights instead of k-largest weights. copy_ndata: bool, optional If True, the node features of the new graph are copied from the original graph. If False, the new graph will not have any node features. (Default: True) copy_edata: bool, optional If True, the edge features of the new graph are copied from the original graph. If False, the new graph will not have any edge features. (Default: True) Returns ------- DGLGraph A sampled subgraph containing only the sampled neighboring edges. It is on CPU. Notes ----- If :attr:`copy_ndata` or :attr:`copy_edata` is True, same tensors are used as the node or edge features of the original graph and the new graph. As a result, users should avoid performing in-place operations on the node features of the new graph to avoid feature corruption. Examples -------- >>> g = dgl.graph(([0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0])) >>> g.edata['weight'] = torch.FloatTensor([0, 1, 0, 1, 0, 1]) >>> sg = dgl.sampling.select_topk(g, 1, 'weight') >>> sg.edges(order='eid') (tensor([2, 1, 0]), tensor([0, 1, 2])) """ # Rectify nodes to a dictionary if nodes is None: nodes = { ntype: F.astype(F.arange(0, g.number_of_nodes(ntype)), g.idtype) for ntype in g.ntypes } elif not isinstance(nodes, dict): if len(g.ntypes) > 1: raise DGLError("Must specify node type when the graph is not homogeneous.") nodes = {g.ntypes[0] : nodes} assert g.device == F.cpu(), "Graph must be on CPU." # Parse nodes into a list of NDArrays. nodes = utils.prepare_tensor_dict(g, nodes, 'nodes') nodes_all_types = [] for ntype in g.ntypes: if ntype in nodes: nodes_all_types.append(F.to_dgl_nd(nodes[ntype])) else: nodes_all_types.append(nd.array([], ctx=nd.cpu())) if not isinstance(k, dict): k_array = [int(k)] * len(g.etypes) else: if len(k) != len(g.etypes): raise DGLError('K value must be specified for each edge type ' 'if a dict is provided.') k_array = [None] * len(g.etypes) for etype, value in k.items(): k_array[g.get_etype_id(etype)] = value k_array = F.to_dgl_nd(F.tensor(k_array, dtype=F.int64)) weight_arrays = [] for etype in g.canonical_etypes: if weight in g.edges[etype].data: weight_arrays.append(F.to_dgl_nd(g.edges[etype].data[weight])) else: raise DGLError('Edge weights "{}" do not exist for relation graph "{}".'.format( weight, etype)) subgidx = _CAPI_DGLSampleNeighborsTopk( g._graph, nodes_all_types, k_array, edge_dir, weight_arrays, bool(ascending)) induced_edges = subgidx.induced_edges ret = DGLHeteroGraph(subgidx.graph, g.ntypes, g.etypes) # handle features if copy_ndata: node_frames = utils.extract_node_subframes(g, None) utils.set_new_frames(ret, node_frames=node_frames) if copy_edata: edge_frames = utils.extract_edge_subframes(g, induced_edges) utils.set_new_frames(ret, edge_frames=edge_frames) return ret
_init_api('dgl.sampling.neighbor', __name__)