Source code for dgl.convert

"""Module for converting graph from/to other object."""
from collections import defaultdict
import numpy as np
import scipy as sp
import networkx as nx

from . import backend as F
from . import heterograph_index
from .heterograph import DGLHeteroGraph, combine_frames
from . import graph_index
from . import utils
from .base import NTYPE, ETYPE, NID, EID, DGLError, dgl_warning

__all__ = [
    'graph',
    'bipartite',
    'hetero_from_relations',
    'heterograph',
    'to_hetero',
    'to_homo',
    'to_networkx',
]

[docs]def graph(data, ntype='_N', etype='_E', num_nodes=None, card=None, validate=True, restrict_format='auto', index_dtype="int64", **kwargs): """Create a graph with one type of nodes and edges. In the sparse matrix perspective, :func:`dgl.graph` creates a graph whose adjacency matrix must be square while :func:`dgl.bipartite` creates a graph that does not necessarily have square adjacency matrix. Parameters ---------- data : graph data Data to initialize graph structure. Supported data formats are (1) list of edge pairs (e.g. [(0, 2), (3, 1), ...]) (2) pair of vertex IDs representing end nodes (e.g. ([0, 3, ...], [2, 1, ...])) (3) scipy sparse matrix (4) networkx graph ntype : str, optional Node type name. (Default: _N) etype : str, optional Edge type name. (Default: _E) num_nodes : int, optional Number of nodes in the graph. If None, infer from input data, i.e. the largest node ID plus 1. (Default: None) card : int, optional Deprecated (see :attr:`num_nodes`). Cardinality (number of nodes in the graph). If None, infer from input data, i.e. the largest node ID plus 1. (Default: None) validate : bool, optional If True, check if node ids are within cardinality, the check process may take some time. (Default: True) If False and card is not None, user would receive a warning. restrict_format : 'any', 'coo', 'csr', 'csc', 'auto', optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. kwargs : key-word arguments, optional Other key word arguments. Only comes into effect when we are using a NetworkX graph. It can consist of: * edge_id_attr_name ``Str``, key name for edge ids in the NetworkX graph. If not found, we will consider the graph not to have pre-specified edge ids. * node_attrs ``List of str``, names for node features to retrieve from the NetworkX graph * edge_attrs ``List of str``, names for edge features to retrieve from the NetworkX graph Returns ------- DGLHeteroGraph Examples -------- Create from pairs of edges with form (src, dst) >>> g = dgl.graph([(0, 2), (0, 3), (1, 2)]) Create from source and destination vertex ID lists >>> u = [0, 0, 1] >>> v = [2, 3, 2] >>> g = dgl.graph((u, v)) The IDs can also be stored in framework-specific tensors >>> import torch >>> u = torch.tensor([0, 0, 1]) >>> v = torch.tensor([2, 3, 2]) >>> g = dgl.graph((u, v)) Create from scipy sparse matrix >>> from scipy.sparse import coo_matrix >>> spmat = coo_matrix(([1,1,1], ([0, 0, 1], [2, 3, 2])), shape=(4, 4)) >>> g = dgl.graph(spmat) Create from networkx graph >>> import networkx as nx >>> nxg = nx.path_graph(3) >>> g = dgl.graph(nxg) Specify node and edge type names >>> g = dgl.graph(..., 'user', 'follows') >>> g.ntypes ['user'] >>> g.etypes ['follows'] >>> g.canonical_etypes [('user', 'follows', 'user')] Check if node ids are within num_nodes specified >>> g = dgl.graph(([0, 1, 2], [1, 2, 0]), num_nodes=2, validate=True) ... dgl._ffi.base.DGLError: Invalid node id 2 (should be less than cardinality 2). >>> g = dgl.graph(([0, 1, 2], [1, 2, 0]), num_nodes=3, validate=True) Graph(num_nodes=3, num_edges=3, ndata_schemes={} edata_schemes={}) """ if card is not None: dgl_warning("Argument 'card' will be deprecated. " "Please use num_nodes={} instead.".format(card)) num_nodes = card if num_nodes is not None: urange, vrange = num_nodes, num_nodes else: urange, vrange = None, None if isinstance(data, tuple): u, v = data return create_from_edges( u, v, ntype, etype, ntype, urange, vrange, validate, restrict_format=restrict_format, index_dtype=index_dtype) elif isinstance(data, list): return create_from_edge_list( data, ntype, etype, ntype, urange, vrange, validate, restrict_format=restrict_format, index_dtype=index_dtype) elif isinstance(data, sp.sparse.spmatrix): return create_from_scipy( data, ntype, etype, ntype, restrict_format=restrict_format, index_dtype=index_dtype) elif isinstance(data, nx.Graph): return create_from_networkx( data, ntype, etype, restrict_format=restrict_format, index_dtype=index_dtype, **kwargs) else: raise DGLError('Unsupported graph data type:', type(data))
[docs]def bipartite(data, utype='_U', etype='_E', vtype='_V', num_nodes=None, card=None, validate=True, restrict_format='auto', index_dtype='int64', **kwargs): """Create a bipartite graph. The result graph is directed and edges must be from ``utype`` nodes to ``vtype`` nodes. Nodes of each type have their own ID counts. In the sparse matrix perspective, :func:`dgl.graph` creates a graph whose adjacency matrix must be square while :func:`dgl.bipartite` creates a graph that does not necessarily have square adjacency matrix. Parameters ---------- data : graph data Data to initialize graph structure. Supported data formats are (1) list of edge pairs (e.g. [(0, 2), (3, 1), ...]) (2) pair of vertex IDs representing end nodes (e.g. ([0, 3, ...], [2, 1, ...])) (3) scipy sparse matrix (4) networkx graph utype : str, optional Source node type name. (Default: _U) etype : str, optional Edge type name. (Default: _E) vtype : str, optional Destination node type name. (Default: _V) num_nodes : 2-tuple of int, optional Number of nodes in the source and destination group. If None, infer from input data, i.e. the largest node ID plus 1 for each type. (Default: None) card : 2-tuple of int, optional Deprecated (see :attr:`num_nodes`). Cardinality (number of nodes in the source and destination group). If None, infer from input data, i.e. the largest node ID plus 1 for each type. (Default: None) validate : bool, optional If True, check if node ids are within cardinality, the check process may take some time. (Default: True) If False and card is not None, user would receive a warning. restrict_format : 'any', 'coo', 'csr', 'csc', 'auto', optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. kwargs : key-word arguments, optional Other key word arguments. Only comes into effect when we are using a NetworkX graph. It can consist of: * edge_id_attr_name ``Str``, key name for edge ids in the NetworkX graph. If not found, we will consider the graph not to have pre-specified edge ids. Returns ------- DGLHeteroGraph Examples -------- Create from pairs of edges >>> g = dgl.bipartite([(0, 2), (0, 3), (1, 2)], 'user', 'plays', 'game') >>> g.ntypes ['user', 'game'] >>> g.etypes ['plays'] >>> g.canonical_etypes [('user', 'plays', 'game')] >>> g.number_of_nodes('user') 2 >>> g.number_of_nodes('game') 4 >>> g.number_of_edges('plays') # 'plays' could be omitted here 3 Create from source and destination vertex ID lists >>> u = [0, 0, 1] >>> v = [2, 3, 2] >>> g = dgl.bipartite((u, v)) The IDs can also be stored in framework-specific tensors >>> import torch >>> u = torch.tensor([0, 0, 1]) >>> v = torch.tensor([2, 3, 2]) >>> g = dgl.bipartite((u, v)) Create from scipy sparse matrix. Since scipy sparse matrix has explicit shape, the cardinality of the result graph is derived from that. >>> from scipy.sparse import coo_matrix >>> spmat = coo_matrix(([1,1,1], ([0, 0, 1], [2, 3, 2])), shape=(4, 4)) >>> g = dgl.bipartite(spmat, 'user', 'plays', 'game') >>> g.number_of_nodes('user') 4 >>> g.number_of_nodes('game') 4 Create from networkx graph. The given graph must follow the bipartite graph convention in networkx. Each node has a ``bipartite`` attribute with values 0 or 1. The result graph has two types of nodes and only edges from ``bipartite=0`` to ``bipartite=1`` will be included. >>> import networkx as nx >>> nxg = nx.complete_bipartite_graph(3, 4) >>> g = dgl.bipartite(nxg, 'user', 'plays', 'game') >>> g.number_of_nodes('user') 3 >>> g.number_of_nodes('game') 4 >>> g.edges() (tensor([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), tensor([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])) Check if node ids are within num_nodes specified >>> g = dgl.bipartite(([0, 1, 2], [1, 2, 3]), num_nodes=(2, 4), validate=True) ... dgl._ffi.base.DGLError: Invalid node id 2 (should be less than cardinality 2). >>> g = dgl.bipartite(([0, 1, 2], [1, 2, 3]), num_nodes=(3, 4), validate=True) >>> g Graph(num_nodes={'_U': 3, '_V': 4}, num_edges={('_U', '_E', '_V'): 3}, metagraph=[('_U', '_V')]) """ if utype == vtype: raise DGLError('utype should not be equal to vtype. Use ``dgl.graph`` instead.') if card is not None: dgl_warning("Argument 'card' will be deprecated. " "Please use num_nodes={} instead.".format(card)) num_nodes = card if num_nodes is not None: urange, vrange = num_nodes else: urange, vrange = None, None if isinstance(data, tuple): u, v = data return create_from_edges( u, v, utype, etype, vtype, urange, vrange, validate, index_dtype=index_dtype, restrict_format=restrict_format) elif isinstance(data, list): return create_from_edge_list( data, utype, etype, vtype, urange, vrange, validate, index_dtype=index_dtype, restrict_format=restrict_format) elif isinstance(data, sp.sparse.spmatrix): return create_from_scipy( data, utype, etype, vtype, restrict_format=restrict_format, index_dtype=index_dtype) elif isinstance(data, nx.Graph): return create_from_networkx_bipartite(data, utype, etype, vtype, restrict_format=restrict_format, index_dtype=index_dtype, **kwargs) else: raise DGLError('Unsupported graph data type:', type(data))
[docs]def hetero_from_relations(rel_graphs, num_nodes_per_type=None): """Create a heterograph from graphs representing connections of each relation. The input is a list of heterographs where the ``i``th graph contains edges of type :math:`(s_i, e_i, d_i)`. If two graphs share a same node type, the number of nodes for the corresponding type should be the same. See **Examples** for details. Parameters ---------- rel_graphs : list of DGLHeteroGraph Each element corresponds to a heterograph for one (src, edge, dst) relation. num_nodes_per_type : dict[str, Tensor], optional Number of nodes per node type. If not given, DGL will infer the number of nodes from the given relation graphs. Returns ------- DGLHeteroGraph A heterograph consisting of all relations. Examples -------- >>> import dgl >>> follows_g = dgl.graph([(0, 1), (1, 2)], 'user', 'follows') >>> plays_g = dgl.bipartite([(0, 0), (3, 1)], 'user', 'plays', 'game') >>> devs_g = dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game') >>> g = dgl.hetero_from_relations([follows_g, plays_g, devs_g]) will raise an error as we have 3 nodes of type 'user' in follows_g and 4 nodes of type 'user' in plays_g. We have two possible methods to avoid the construction. **Method 1**: Manually specify the number of nodes for all types when constructing the relation graphs. >>> # A graph with 4 nodes of type 'user' >>> follows_g = dgl.graph([(0, 1), (1, 2)], 'user', 'follows', num_nodes=4) >>> # A bipartite graph with 4 nodes of src type ('user') and 2 nodes of dst type ('game') >>> plays_g = dgl.bipartite([(0, 0), (3, 1)], 'user', 'plays', 'game', num_nodes=(4, 2)) >>> devs_g = dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game') >>> g = dgl.hetero_from_relations([follows_g, plays_g, devs_g]) >>> print(g) Graph(num_nodes={'user': 4, 'game': 2, 'developer': 2}, num_edges={('user', 'follows', 'user'): 2, ('user', 'plays', 'game'): 2, ('developer', 'develops', 'game'): 2}, metagraph=[('user', 'user'), ('user', 'game'), ('developer', 'game')]) ``devs_g`` does not have nodes of type ``'user'`` so no error will be raised. **Method 2**: Construct a heterograph at once without intermediate relation graphs, in which case we will infer the number of nodes for each type. >>> g = dgl.heterograph({ >>> ('user', 'follows', 'user'): [(0, 1), (1, 2)], >>> ('user', 'plays', 'game'): [(0, 0), (3, 1)], >>> ('developer', 'develops', 'game'): [(0, 0), (1, 1)] >>> }) >>> print(g) Graph(num_nodes={'user': 4, 'game': 2, 'developer': 2}, num_edges={('user', 'follows', 'user'): 2, ('user', 'plays', 'game'): 2, ('developer', 'develops', 'game'): 2}, metagraph=[('user', 'user'), ('user', 'game'), ('developer', 'game')]) """ # TODO(minjie): this API can be generalized as a union operation of the input graphs # TODO(minjie): handle node/edge data # infer meta graph meta_edges_src, meta_edges_dst = [], [] ntypes = [] etypes = [] # TODO(BarclayII): I'm keeping the node type names sorted because even if # the metagraph is the same, the same node type name in different graphs may # map to different node type IDs. # In the future, we need to lower the type names into C++. if num_nodes_per_type is None: ntype_set = set() for rgrh in rel_graphs: assert len(rgrh.etypes) == 1 stype, etype, dtype = rgrh.canonical_etypes[0] ntype_set.add(stype) ntype_set.add(dtype) ntypes = list(sorted(ntype_set)) else: ntypes = list(sorted(num_nodes_per_type.keys())) num_nodes_per_type = utils.toindex([num_nodes_per_type[ntype] for ntype in ntypes], "int64") ntype_dict = {ntype: i for i, ntype in enumerate(ntypes)} index_dtype = rel_graphs[0]._idtype_str for rgrh in rel_graphs: if rgrh._idtype_str != index_dtype: raise Exception("Expect relation graphs to be {}, but got {}".format( index_dtype, rgrh._idtype_str)) stype, etype, dtype = rgrh.canonical_etypes[0] meta_edges_src.append(ntype_dict[stype]) meta_edges_dst.append(ntype_dict[dtype]) etypes.append(etype) # metagraph is DGLGraph, currently still using int64 as index dtype metagraph = graph_index.from_coo(len(ntypes), meta_edges_src, meta_edges_dst, True) # create graph index hgidx = heterograph_index.create_heterograph_from_relations( metagraph, [rgrh._graph for rgrh in rel_graphs], num_nodes_per_type) retg = DGLHeteroGraph(hgidx, ntypes, etypes) for i, rgrh in enumerate(rel_graphs): for ntype in rgrh.ntypes: retg.nodes[ntype].data.update(rgrh.nodes[ntype].data) retg._edge_frames[i].update(rgrh._edge_frames[0]) return retg
[docs]def heterograph(data_dict, num_nodes_dict=None, restrict_format='auto', index_dtype='int64'): """Create a heterogeneous graph from a dictionary between edge types and edge lists. Parameters ---------- data_dict : dict The dictionary between edge types and edge list data. The edge types are specified as a triplet of (source node type name, edge type name, destination node type name). The edge list data can be anything acceptable by :func:`dgl.graph` or :func:`dgl.bipartite`, or objects returned by the two functions themselves. num_nodes_dict : dict[str, int] The number of nodes for each node type. By default DGL infers the number of nodes for each node type from ``data_dict`` by taking the maximum node ID plus one for each node type. restrict_format : 'any', 'coo', 'csr', 'csc', 'auto', optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. Returns ------- DGLHeteroGraph Examples -------- >>> g = dgl.heterograph({ ... ('user', 'follows', 'user'): [(0, 1), (1, 2)], ... ('user', 'plays', 'game'): [(0, 0), (1, 0), (1, 1), (2, 1)], ... ('developer', 'develops', 'game'): [(0, 0), (1, 1)], ... }) """ rel_graphs = [] # infer number of nodes for each node type if num_nodes_dict is None: num_nodes_dict = defaultdict(int) for (srctype, etype, dsttype), data in data_dict.items(): if isinstance(data, tuple): src = utils.toindex(data[0]).tonumpy() dst = utils.toindex(data[1]).tonumpy() nsrc = (src.max() + 1) if len(src) > 0 else 0 ndst = (dst.max() + 1) if len(dst) > 0 else 0 elif isinstance(data, list): if len(data) == 0: nsrc = ndst = 0 else: src, dst = zip(*data) src = utils.toindex(src).tonumpy() dst = utils.toindex(dst).tonumpy() nsrc = src.max() + 1 ndst = dst.max() + 1 elif isinstance(data, sp.sparse.spmatrix): nsrc = data.shape[0] ndst = data.shape[1] elif isinstance(data, nx.Graph): if data.number_of_nodes() == 0: nsrc = ndst = 0 elif srctype == dsttype: nsrc = ndst = data.number_of_nodes() else: nsrc = len({n for n, d in data.nodes(data=True) if d['bipartite'] == 0}) ndst = data.number_of_nodes() - nsrc elif isinstance(data, DGLHeteroGraph): # original node type and edge type of ``data`` is ignored. assert len(data.canonical_etypes) == 1, \ "Relational graphs must have only one edge type." srctype, _, dsttype = data.canonical_etypes[0] nsrc = data.number_of_nodes(srctype) ndst = data.number_of_nodes(dsttype) else: raise DGLError('Unsupported graph data type %s for %s' % ( type(data), (srctype, etype, dsttype))) if srctype == dsttype: ndst = nsrc = max(nsrc, ndst) num_nodes_dict[srctype] = max(num_nodes_dict[srctype], nsrc) num_nodes_dict[dsttype] = max(num_nodes_dict[dsttype], ndst) for (srctype, etype, dsttype), data in data_dict.items(): if isinstance(data, DGLHeteroGraph): rel_graphs.append(data) elif srctype == dsttype: rel_graphs.append(graph( data, srctype, etype, num_nodes=num_nodes_dict[srctype], validate=False, restrict_format=restrict_format, index_dtype=index_dtype)) else: rel_graphs.append(bipartite( data, srctype, etype, dsttype, num_nodes=(num_nodes_dict[srctype], num_nodes_dict[dsttype]), validate=False, restrict_format=restrict_format, index_dtype=index_dtype)) return hetero_from_relations(rel_graphs, num_nodes_dict)
[docs]def to_hetero(G, ntypes, etypes, ntype_field=NTYPE, etype_field=ETYPE, metagraph=None): """Convert the given homogeneous graph to a heterogeneous graph. The input graph should have only one type of nodes and edges. Each node and edge stores an integer feature (under ``ntype_field`` and ``etype_field``), representing the type id, which can be used to retrieve the type names stored in the given ``ntypes`` and ``etypes`` arguments. The function will automatically distinguish edge types that have the same given type IDs but different src and dst type IDs. For example, we allow both edges A and B to have the same type ID 0, but one has (0, 1) and the other as (2, 3) as the (src, dst) type IDs. In this case, the function will "split" edge type 0 into two types: (0, ty_A, 1) and (2, ty_B, 3). In another word, these two edges share the same edge type name, but can be distinguished by a canonical edge type tuple. Parameters ---------- G : DGLHeteroGraph Input homogeneous graph. ntypes : list of str The node type names. etypes : list of str The edge type names. ntype_field : str, optional The feature field used to store node type. (Default: ``dgl.NTYPE``) etype_field : str, optional The feature field used to store edge type. (Default: ``dgl.ETYPE``) metagraph : networkx MultiDiGraph, optional Metagraph of the returned heterograph. If provided, DGL assumes that G can indeed be described with the given metagraph. If None, DGL will infer the metagraph from the given inputs, which would be potentially slower for large graphs. Returns ------- DGLHeteroGraph A heterograph. The parent node and edge ID are stored in the column ``dgl.NID`` and ``dgl.EID`` respectively for all node/edge types. Notes ----- The returned node and edge types may not necessarily be in the same order as ``ntypes`` and ``etypes``. And edge types may be duplicated if the source and destination types differ. The node IDs of a single type in the returned heterogeneous graph is ordered the same as the nodes with the same ``ntype_field`` feature. Edge IDs of a single type is similar. Examples -------- >>> g1 = dgl.bipartite([(0, 1), (1, 2)], 'user', 'develops', 'activity') >>> g2 = dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game') >>> hetero_g = dgl.hetero_from_relations([g1, g2]) >>> print(hetero_g) Graph(num_nodes={'user': 2, 'activity': 3, 'developer': 2, 'game': 2}, num_edges={('user', 'develops', 'activity'): 2, ('developer', 'develops', 'game'): 2}, metagraph=[('user', 'activity'), ('developer', 'game')]) We first convert the heterogeneous graph to a homogeneous graph. >>> homo_g = dgl.to_homo(hetero_g) >>> print(homo_g) Graph(num_nodes=9, num_edges=4, ndata_schemes={'_TYPE': Scheme(shape=(), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)} edata_schemes={'_TYPE': Scheme(shape=(), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)}) >>> homo_g.ndata {'_TYPE': tensor([0, 0, 1, 1, 1, 2, 2, 3, 3]), '_ID': tensor([0, 1, 0, 1, 2, 0, 1, 0, 1])} Nodes 0, 1 for 'user', 2, 3, 4 for 'activity', 5, 6 for 'developer', 7, 8 for 'game' >>> homo_g.edata {'_TYPE': tensor([0, 0, 1, 1]), '_ID': tensor([0, 1, 0, 1])} Edges 0, 1 for ('user', 'develops', 'activity'), 2, 3 for ('developer', 'develops', 'game') Now convert the homogeneous graph back to a heterogeneous graph. >>> hetero_g_2 = dgl.to_hetero(homo_g, hetero_g.ntypes, hetero_g.etypes) >>> print(hetero_g_2) Graph(num_nodes={'user': 2, 'activity': 3, 'developer': 2, 'game': 2}, num_edges={('user', 'develops', 'activity'): 2, ('developer', 'develops', 'game'): 2}, metagraph=[('user', 'activity'), ('developer', 'game')]) See Also -------- dgl.to_homo """ # TODO(minjie): use hasattr to support DGLGraph input; should be fixed once # DGLGraph is merged with DGLHeteroGraph if (hasattr(G, 'ntypes') and len(G.ntypes) > 1 or hasattr(G, 'etypes') and len(G.etypes) > 1): raise DGLError('The input graph should be homogenous and have only one ' ' type of nodes and edges.') num_ntypes = len(ntypes) index_dtype = G._idtype_str ntype_ids = F.asnumpy(G.ndata[ntype_field]) etype_ids = F.asnumpy(G.edata[etype_field]) # relabel nodes to per-type local IDs ntype_count = np.bincount(ntype_ids, minlength=num_ntypes) ntype_offset = np.insert(np.cumsum(ntype_count), 0, 0) ntype_ids_sortidx = np.argsort(ntype_ids) ntype_local_ids = np.zeros_like(ntype_ids) node_groups = [] for i in range(num_ntypes): node_group = ntype_ids_sortidx[ntype_offset[i]:ntype_offset[i+1]] node_groups.append(node_group) ntype_local_ids[node_group] = np.arange(ntype_count[i]) src, dst = G.all_edges(order='eid') src = F.asnumpy(src) dst = F.asnumpy(dst) src_local = ntype_local_ids[src] dst_local = ntype_local_ids[dst] # a 2D tensor of shape (E, 3). Each row represents the (stid, etid, dtid) tuple. edge_ctids = np.stack([ntype_ids[src], etype_ids, ntype_ids[dst]], 1) # infer metagraph and canonical edge types # No matter which branch it takes, the code will generate a 2D tensor of shape (E_m, 3), # E_m is the set of all possible canonical edge tuples. Each row represents the # (stid, dtid, dtid) tuple. We then compute a 2D tensor of shape (E, E_m) using the # above ``edge_ctids`` matrix. Each element i,j indicates whether the edge i is of the # canonical edge type j. We can then group the edges of the same type together. if metagraph is None: canonical_etids, _, etype_remapped = \ utils.make_invmap(list(tuple(_) for _ in edge_ctids), False) etype_mask = (etype_remapped[None, :] == np.arange(len(canonical_etids))[:, None]) else: ntypes_invmap = {nt: i for i, nt in enumerate(ntypes)} etypes_invmap = {et: i for i, et in enumerate(etypes)} canonical_etids = [] for i, (srctype, dsttype, etype) in enumerate(metagraph.edges(keys=True)): srctype_id = ntypes_invmap[srctype] etype_id = etypes_invmap[etype] dsttype_id = ntypes_invmap[dsttype] canonical_etids.append((srctype_id, etype_id, dsttype_id)) canonical_etids = np.asarray(canonical_etids) etype_mask = (edge_ctids[None, :] == canonical_etids[:, None]).all(2) edge_groups = [etype_mask[i].nonzero()[0] for i in range(len(canonical_etids))] rel_graphs = [] for i, (stid, etid, dtid) in enumerate(canonical_etids): src_of_etype = src_local[edge_groups[i]] dst_of_etype = dst_local[edge_groups[i]] if stid == dtid: rel_graph = graph( (src_of_etype, dst_of_etype), ntypes[stid], etypes[etid], num_nodes=ntype_count[stid], validate=False, index_dtype=index_dtype) else: rel_graph = bipartite( (src_of_etype, dst_of_etype), ntypes[stid], etypes[etid], ntypes[dtid], num_nodes=(ntype_count[stid], ntype_count[dtid]), validate=False, index_dtype=index_dtype) rel_graphs.append(rel_graph) hg = hetero_from_relations(rel_graphs, {ntype: count for ntype, count in zip( ntypes, ntype_count)}) ntype2ngrp = {ntype : node_groups[ntid] for ntid, ntype in enumerate(ntypes)} # features for key, data in G.ndata.items(): for ntid, ntype in enumerate(hg.ntypes): rows = F.copy_to(F.tensor(ntype2ngrp[ntype]), F.context(data)) hg._node_frames[ntid][key] = F.gather_row(data, rows) for key, data in G.edata.items(): for etid in range(len(hg.canonical_etypes)): rows = F.copy_to(F.tensor(edge_groups[etid]), F.context(data)) hg._edge_frames[etid][key] = F.gather_row(data, rows) for ntid, ntype in enumerate(hg.ntypes): hg._node_frames[ntid][NID] = F.tensor(ntype2ngrp[ntype]) for etid in range(len(hg.canonical_etypes)): hg._edge_frames[etid][EID] = F.tensor(edge_groups[etid]) return hg
[docs]def to_homo(G): """Convert the given heterogeneous graph to a homogeneous graph. The returned graph has only one type of nodes and edges. Node and edge types are stored as features in the returned graph. Each feature is an integer representing the type id, which can be used to retrieve the type names stored in ``G.ntypes`` and ``G.etypes`` arguments. Parameters ---------- G : DGLHeteroGraph Input heterogeneous graph. Returns ------- DGLHeteroGraph A homogeneous graph. The parent node and edge type/ID are stored in columns ``dgl.NTYPE/dgl.NID`` and ``dgl.ETYPE/dgl.EID`` respectively. Examples -------- >>> follows_g = dgl.graph([(0, 1), (1, 2)], 'user', 'follows') >>> devs_g = dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game') >>> hetero_g = dgl.hetero_from_relations([follows_g, devs_g]) >>> homo_g = dgl.to_homo(hetero_g) >>> homo_g.ndata {'_TYPE': tensor([0, 0, 0, 1, 1, 2, 2]), '_ID': tensor([0, 1, 2, 0, 1, 0, 1])} First three nodes for 'user', next two for 'developer' and the last two for 'game' >>> homo_g.edata {'_TYPE': tensor([0, 0, 1, 1]), '_ID': tensor([0, 1, 0, 1])} First two edges for 'follows', next two for 'develops' See Also -------- dgl.to_hetero """ num_nodes_per_ntype = [G.number_of_nodes(ntype) for ntype in G.ntypes] offset_per_ntype = np.insert(np.cumsum(num_nodes_per_ntype), 0, 0) srcs = [] dsts = [] etype_ids = [] eids = [] ntype_ids = [] nids = [] total_num_nodes = 0 for ntype_id, ntype in enumerate(G.ntypes): num_nodes = G.number_of_nodes(ntype) total_num_nodes += num_nodes ntype_ids.append(F.full_1d(num_nodes, ntype_id, F.int64, F.cpu())) nids.append(F.arange(0, num_nodes, G._idtype_str)) for etype_id, etype in enumerate(G.canonical_etypes): srctype, _, dsttype = etype src, dst = G.all_edges(etype=etype, order='eid') num_edges = len(src) srcs.append(src + int(offset_per_ntype[G.get_ntype_id(srctype)])) dsts.append(dst + int(offset_per_ntype[G.get_ntype_id(dsttype)])) etype_ids.append(F.full_1d(num_edges, etype_id, F.int64, F.cpu())) eids.append(F.arange(0, num_edges, G._idtype_str)) retg = graph((F.cat(srcs, 0), F.cat(dsts, 0)), num_nodes=total_num_nodes, validate=False, index_dtype=G._idtype_str) # copy features comb_nf = combine_frames(G._node_frames, range(len(G.ntypes))) comb_ef = combine_frames(G._edge_frames, range(len(G.etypes))) if comb_nf is not None: retg.ndata.update(comb_nf) if comb_ef is not None: retg.edata.update(comb_ef) # assign node type and id mapping field. retg.ndata[NTYPE] = F.cat(ntype_ids, 0) retg.ndata[NID] = F.cat(nids, 0) retg.edata[ETYPE] = F.cat(etype_ids, 0) retg.edata[EID] = F.cat(eids, 0) return retg
############################################################ # Internal APIs ############################################################ def create_from_edges(u, v, utype, etype, vtype, urange=None, vrange=None, validate=True, restrict_format="auto", index_dtype='int64'): """Internal function to create a graph from incident nodes with types. utype could be equal to vtype Parameters ---------- u : iterable of int List of source node IDs. v : iterable of int List of destination node IDs. utype : str Source node type name. etype : str Edge type name. vtype : str Destination node type name. urange : int, optional The source node ID range. If None, the value is the maximum of the source node IDs in the edge list plus 1. (Default: None) vrange : int, optional The destination node ID range. If None, the value is the maximum of the destination node IDs in the edge list plus 1. (Default: None) validate : bool, optional If True, checks if node IDs are within range. restrict_format : 'any', 'coo', 'csr', 'csc', 'auto', optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. Returns ------- DGLHeteroGraph """ u = utils.toindex(u, index_dtype) v = utils.toindex(v, index_dtype) if validate: if urange is not None and len(u) > 0 and \ urange <= int(F.asnumpy(F.max(u.tousertensor(), dim=0))): raise DGLError('Invalid node id {} (should be less than cardinality {}).'.format( urange, int(F.asnumpy(F.max(u.tousertensor(), dim=0))))) if vrange is not None and len(v) > 0 and \ vrange <= int(F.asnumpy(F.max(v.tousertensor(), dim=0))): raise DGLError('Invalid node id {} (should be less than cardinality {}).'.format( vrange, int(F.asnumpy(F.max(v.tousertensor(), dim=0))))) urange = urange or ( 0 if len(u) == 0 else (int(F.asnumpy(F.max(u.tousertensor(), dim=0))) + 1)) vrange = vrange or ( 0 if len(v) == 0 else (int(F.asnumpy(F.max(v.tousertensor(), dim=0))) + 1)) if utype == vtype: urange = vrange = max(urange, vrange) num_ntypes = 1 else: num_ntypes = 2 hgidx = heterograph_index.create_unitgraph_from_coo( num_ntypes, urange, vrange, u, v, restrict_format) if utype == vtype: return DGLHeteroGraph(hgidx, [utype], [etype]) else: return DGLHeteroGraph(hgidx, [utype, vtype], [etype]) def create_from_edge_list(elist, utype, etype, vtype, urange=None, vrange=None, validate=True, restrict_format='auto', index_dtype='int64'): """Internal function to create a heterograph from a list of edge tuples with types. utype could be equal to vtype Parameters ---------- elist : iterable of int pairs List of (src, dst) node ID pairs. utype : str Source node type name. etype : str Edge type name. vtype : str Destination node type name. urange : int, optional The source node ID range. If None, the value is the maximum of the source node IDs in the edge list plus 1. (Default: None) vrange : int, optional The destination node ID range. If None, the value is the maximum of the destination node IDs in the edge list plus 1. (Default: None) validate : bool, optional If True, checks if node IDs are within range. restrict_format : 'any', 'coo', 'csr', 'csc', 'auto', optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. Returns ------- DGLHeteroGraph """ if len(elist) == 0: u, v = [], [] else: u, v = zip(*elist) u = list(u) v = list(v) return create_from_edges(u, v, utype, etype, vtype, urange, vrange, validate, restrict_format, index_dtype=index_dtype) def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False, restrict_format='auto', index_dtype='int64'): """Internal function to create a heterograph from a scipy sparse matrix with types. Parameters ---------- spmat : scipy.sparse.spmatrix The adjacency matrix whose rows represent sources and columns represent destinations. utype : str Source node type name. etype : str Edge type name. vtype : str Destination node type name. with_edge_id : bool If True, the entries in the sparse matrix are treated as edge IDs. Otherwise, the entries are ignored and edges will be added in (source, destination) order. Note that this option only affects CSR matrices; COO matrices' rows and cols are always assumed to be ordered by edge ID already. validate : bool, optional If True, checks if node IDs are within range. restrict_format : 'any', 'coo', 'csr', 'csc', 'auto', optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. Returns ------- DGLHeteroGraph """ num_src, num_dst = spmat.shape num_ntypes = 1 if utype == vtype else 2 if spmat.getformat() == 'coo': row = utils.toindex(spmat.row.astype(index_dtype), index_dtype) col = utils.toindex(spmat.col.astype(index_dtype), index_dtype) hgidx = heterograph_index.create_unitgraph_from_coo( num_ntypes, num_src, num_dst, row, col, restrict_format) else: spmat = spmat.tocsr() indptr = utils.toindex(spmat.indptr.astype(index_dtype), index_dtype) indices = utils.toindex(spmat.indices.astype(index_dtype), index_dtype) # TODO(minjie): with_edge_id is only reasonable for csr matrix. How to fix? data = utils.toindex(spmat.data if with_edge_id else list(range(len(indices))), index_dtype) hgidx = heterograph_index.create_unitgraph_from_csr( num_ntypes, num_src, num_dst, indptr, indices, data, restrict_format) if num_ntypes == 1: return DGLHeteroGraph(hgidx, [utype], [etype]) else: return DGLHeteroGraph(hgidx, [utype, vtype], [etype]) def create_from_networkx(nx_graph, ntype, etype, edge_id_attr_name='id', node_attrs=None, edge_attrs=None, restrict_format='auto', index_dtype='int64'): """Create a heterograph that has only one set of nodes and edges. Parameters ---------- nx_graph : NetworkX graph ntype : str Type name for both source and destination nodes etype : str Type name for edges edge_id_attr_name : str, optional Key name for edge ids in the NetworkX graph. If not found, we will consider the graph not to have pre-specified edge ids. (Default: 'id') node_attrs : list of str Names for node features to retrieve from the NetworkX graph (Default: None) edge_attrs : list of str Names for edge features to retrieve from the NetworkX graph (Default: None) restrict_format : 'any', 'coo', 'csr', 'csc', 'auto', optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. Returns ------- g : DGLHeteroGraph """ if not nx_graph.is_directed(): nx_graph = nx_graph.to_directed() # Relabel nodes using consecutive integers nx_graph = nx.convert_node_labels_to_integers(nx_graph, ordering='sorted') # nx_graph.edges(data=True) returns src, dst, attr_dict if nx_graph.number_of_edges() > 0: has_edge_id = edge_id_attr_name in next(iter(nx_graph.edges(data=True)))[-1] else: has_edge_id = False if has_edge_id: num_edges = nx_graph.number_of_edges() src = np.zeros((num_edges,), dtype=getattr(np, index_dtype)) dst = np.zeros((num_edges,), dtype=getattr(np, index_dtype)) for u, v, attr in nx_graph.edges(data=True): eid = attr[edge_id_attr_name] src[eid] = u dst[eid] = v else: src = [] dst = [] for e in nx_graph.edges: src.append(e[0]) dst.append(e[1]) src = utils.toindex(src, index_dtype) dst = utils.toindex(dst, index_dtype) num_nodes = nx_graph.number_of_nodes() g = create_from_edges(src, dst, ntype, etype, ntype, num_nodes, num_nodes, validate=False, restrict_format=restrict_format, index_dtype=index_dtype) # handle features # copy attributes def _batcher(lst): if F.is_tensor(lst[0]): return F.cat([F.unsqueeze(x, 0) for x in lst], dim=0) else: return F.tensor(lst) if node_attrs is not None: # mapping from feature name to a list of tensors to be concatenated attr_dict = defaultdict(list) for nid in range(g.number_of_nodes()): for attr in node_attrs: attr_dict[attr].append(nx_graph.nodes[nid][attr]) for attr in node_attrs: g.ndata[attr] = _batcher(attr_dict[attr]) if edge_attrs is not None: # mapping from feature name to a list of tensors to be concatenated attr_dict = defaultdict(lambda: [None] * g.number_of_edges()) # each defaultdict value is initialized to be a list of None # None here serves as placeholder to be replaced by feature with # corresponding edge id if has_edge_id: num_edges = g.number_of_edges() for _, _, attrs in nx_graph.edges(data=True): if attrs[edge_id_attr_name] >= num_edges: raise DGLError('Expect the pre-specified edge ids to be' ' smaller than the number of edges --' ' {}, got {}.'.format(num_edges, attrs['id'])) for key in edge_attrs: attr_dict[key][attrs['id']] = attrs[key] else: # XXX: assuming networkx iteration order is deterministic # so the order is the same as graph_index.from_networkx for eid, (_, _, attrs) in enumerate(nx_graph.edges(data=True)): for key in edge_attrs: attr_dict[key][eid] = attrs[key] for attr in edge_attrs: for val in attr_dict[attr]: if val is None: raise DGLError('Not all edges have attribute {}.'.format(attr)) g.edata[attr] = _batcher(attr_dict[attr]) return g def create_from_networkx_bipartite(nx_graph, utype, etype, vtype, edge_id_attr_name='id', node_attrs=None, edge_attrs=None, restrict_format='auto', index_dtype='int64'): """Create a heterograph that has one set of source nodes, one set of destination nodes and one set of edges. Parameters ---------- nx_graph : NetworkX graph The input graph must follow the bipartite graph convention of networkx. Each node has an attribute ``bipartite`` with values 0 and 1 indicating which set it belongs to. Only edges from node set 0 to node set 1 are added to the returned graph. utype : str Source node type name. etype : str Edge type name. vtype : str Destination node type name. edge_id_attr_name : str, optional Key name for edge ids in the NetworkX graph. If not found, we will consider the graph not to have pre-specified edge ids. (Default: 'id') node_attrs : list of str Names for node features to retrieve from the NetworkX graph (Default: None) edge_attrs : list of str Names for edge features to retrieve from the NetworkX graph (Default: None) restrict_format : 'any', 'coo', 'csr', 'csc', 'auto' optional Force the storage format. Default: 'auto' (i.e. let DGL decide what to use). index_dtype : 'int32', 'int64', optional Force the index data type. Default: 'int64'. Returns ------- g : DGLHeteroGraph """ if not nx_graph.is_directed(): nx_graph = nx_graph.to_directed() top_nodes = {n for n, d in nx_graph.nodes(data=True) if d['bipartite'] == 0} bottom_nodes = set(nx_graph) - top_nodes top_nodes = sorted(top_nodes) bottom_nodes = sorted(bottom_nodes) top_map = {n : i for i, n in enumerate(top_nodes)} bottom_map = {n : i for i, n in enumerate(bottom_nodes)} if nx_graph.number_of_edges() > 0: has_edge_id = edge_id_attr_name in next(iter(nx_graph.edges(data=True)))[-1] else: has_edge_id = False if has_edge_id: num_edges = nx_graph.number_of_edges() src = np.zeros((num_edges,), dtype=getattr(np, index_dtype)) dst = np.zeros((num_edges,), dtype=getattr(np, index_dtype)) for u, v, attr in nx_graph.edges(data=True): eid = attr[edge_id_attr_name] src[eid] = top_map[u] dst[eid] = bottom_map[v] else: src = [] dst = [] for e in nx_graph.edges: if e[0] in top_map: src.append(top_map[e[0]]) dst.append(bottom_map[e[1]]) src = utils.toindex(src, index_dtype) dst = utils.toindex(dst, index_dtype) g = create_from_edges(src, dst, utype, etype, vtype, len(top_nodes), len(bottom_nodes), validate=False, restrict_format=restrict_format, index_dtype=index_dtype) # TODO attributes assert node_attrs is None, 'Retrieval of node attributes are not supported yet.' assert edge_attrs is None, 'Retrieval of edge attributes are not supported yet.' return g
[docs]def to_networkx(g, node_attrs=None, edge_attrs=None): """Convert to networkx graph. The edge id will be saved as the 'id' edge attribute. Parameters ---------- g : DGLGraph or DGLHeteroGraph For DGLHeteroGraphs, we currently only support the case of one node type and one edge type. node_attrs : iterable of str, optional The node attributes to be copied. (Default: None) edge_attrs : iterable of str, optional The edge attributes to be copied. (Default: None) Returns ------- networkx.DiGraph The nx graph """ return g.to_networkx(node_attrs, edge_attrs)