Source code for dgl.distributed.nn.pytorch.sparse_emb

"""Define sparse embedding and optimizer."""

import torch as th
from .... import backend as F
from .... import utils
from ...dist_tensor import DistTensor

class DistEmbedding:
    '''Distributed node embeddings.

    DGL provides a distributed embedding to support models that require learnable embeddings.
    DGL's distributed embeddings are mainly used for learning node embeddings of graph models.
    Because distributed embeddings are part of a model, they are updated by mini-batches.
    The distributed embeddings have to be updated by DGL's optimizers instead of
    the optimizers provided by the deep learning frameworks (e.g., PyTorch and MXNet).

    To support efficient training on a graph with many nodes, the embeddings support sparse
    updates. That is, only the embeddings involved in a mini-batch computation are updated.
    Currently, DGL provides only one optimizer: `SparseAdagrad`. DGL will provide more
    optimizers in the future.

    Distributed embeddings are sharded and stored in a cluster of machines in the same way as
    :py:class:`dgl.distributed.DistTensor`, except that distributed embeddings are trainable.
    Because distributed embeddings are sharded in the same way as nodes and edges of
    a distributed graph, accessing them is usually much more efficient than using the
    sparse embeddings provided by the deep learning frameworks.

    Parameters
    ----------
    num_embeddings : int
        The number of embeddings. Currently, the number of embeddings has to be the same as
        the number of nodes or the number of edges.
    embedding_dim : int
        The dimension size of embeddings.
    name : str, optional
        The name of the embeddings. The name can uniquely identify embeddings in a system
        so that another DistEmbedding object can refer to the same embeddings.
    init_func : callable, optional
        The function to create the initial data. If the init function is not provided,
        the values of the embeddings are initialized to zero.
    part_policy : PartitionPolicy, optional
        The partition policy that assigns embeddings to different machines in the cluster.
        Currently, it only supports node partition policy or edge partition policy.
        The system determines the right partition policy automatically.

    Examples
    --------
    >>> def initializer(shape, dtype):
    ...     arr = th.zeros(shape, dtype=dtype)
    ...     arr.uniform_(-1, 1)
    ...     return arr
    >>> emb = dgl.distributed.DistEmbedding(g.number_of_nodes(), 10, init_func=initializer)
    >>> optimizer = dgl.distributed.optim.SparseAdagrad([emb], lr=0.001)
    >>> for blocks in dataloader:
    ...     feats = emb(nids)
    ...     loss = F.sum(feats + 1, 0)
    ...     loss.backward()
    ...     optimizer.step()

    Note
    ----
    When a ``DistEmbedding`` object is used while the deep learning framework is recording
    the forward computation, users have to invoke
    :py:meth:`~dgl.distributed.optim.SparseAdagrad.step` afterwards; otherwise, there will
    be a memory leak.
    '''
    def __init__(self, num_embeddings, embedding_dim, name=None,
                 init_func=None, part_policy=None):
        self._tensor = DistTensor((num_embeddings, embedding_dim), F.float32, name,
                                  init_func=init_func, part_policy=part_policy)
        self._trace = []
        self._name = name
        self._num_embeddings = num_embeddings
        self._embedding_dim = embedding_dim

        # Check whether it is multi-gpu/distributed training or not
        if th.distributed.is_initialized():
            self._rank = th.distributed.get_rank()
            self._world_size = th.distributed.get_world_size()
        # [TODO] The following code is clearly wrong but changing it to "raise DGLError"
        # actually fails unit test. ???
        # else:
        #     assert 'th.distributed should be initialized'

        self._optm_state = None  # track optimizer state
        self._part_policy = part_policy

    def __call__(self, idx, device=th.device('cpu')):
        """
        Parameters
        ----------
        idx : tensor
            Index of the embeddings to collect.
        device : th.device
            Target device to put the collected embeddings.

        Returns
        -------
        Tensor
            The requested node embeddings
        """
        idx = utils.toindex(idx).tousertensor()
        emb = self._tensor[idx].to(device, non_blocking=True)
        if F.is_recording():
            emb = F.attach_grad(emb)
            self._trace.append((idx.to(device, non_blocking=True), emb))
        return emb

    def reset_trace(self):
        '''Reset the traced data.
        '''
        self._trace = []

    @property
    def part_policy(self):
        """Return the partition policy

        Returns
        -------
        PartitionPolicy
            The partition policy
        """
        return self._part_policy

    @property
    def name(self):
        """Return the name of the embeddings

        Returns
        -------
        str
            The name of the embeddings
        """
        return self._tensor.tensor_name

    @property
    def data_name(self):
        """Return the data name of the embeddings

        Returns
        -------
        str
            The data name of the embeddings
        """
        return self._tensor._name

    @property
    def kvstore(self):
        """Return the kvstore client

        Returns
        -------
        KVClient
            The kvstore client
        """
        return self._tensor.kvstore

    @property
    def num_embeddings(self):
        """Return the number of embeddings

        Returns
        -------
        int
            The number of embeddings
        """
        return self._num_embeddings

    @property
    def embedding_dim(self):
        """Return the dimension of embeddings

        Returns
        -------
        int
            The dimension of embeddings
        """
        return self._embedding_dim

    @property
    def optm_state(self):
        """Return the optimizer related state tensor.

        Returns
        -------
        tuple of torch.Tensor
            The optimizer related state.
        """
        return self._optm_state

    @property
    def weight(self):
        """Return the tensor storing the node embeddings

        Returns
        -------
        torch.Tensor
            The tensor storing the node embeddings
        """
        return self._tensor
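
# The Note in the class docstring says an optimizer's ``step()`` must be invoked after
# every recorded forward pass; otherwise the ``(idx, emb)`` pairs accumulated in
# ``self._trace`` (and the gradients attached to them) are never released. The sketch
# below illustrates, as a minimal assumption-laden example, how a sparse optimizer could
# consume that trace. ``_ToySparseSGD`` and its plain SGD update are illustrative only and
# are not part of this module; the real optimizer to use is
# ``dgl.distributed.optim.SparseAdagrad``, as shown in the Examples section above.
class _ToySparseSGD:
    """Illustrative sparse SGD over DistEmbedding traces (not part of the DGL API)."""
    def __init__(self, embs, lr=0.01):
        self._embs = embs
        self._lr = lr

    def step(self):
        with th.no_grad():
            for emb in self._embs:
                for idx, data in emb._trace:
                    grad = data.grad
                    if grad is None:
                        continue
                    cpu_idx = idx.cpu()
                    # Only the rows touched in this mini-batch are read from and written
                    # back to the distributed tensor, which is what makes the update sparse.
                    emb.weight[cpu_idx] = emb.weight[cpu_idx] - self._lr * grad.cpu()
                # Drop the recorded tensors so they can be garbage-collected.
                emb.reset_trace()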