Source code for dgl.sampling.negative
"""Negative sampling APIs"""
from numpy.polynomial import polynomial
from .._ffi.function import _init_api
from .. import backend as F
from .. import utils
from ..heterograph import DGLHeteroGraph
__all__ = [
'global_uniform_negative_sampling']
def _calc_redundancy(k_hat, num_edges, num_pairs, r=3): # pylint: disable=invalid-name
# pylint: disable=invalid-name
# Calculates the number of samples required based on a lower-bound
# of the expected number of negative samples, based on N draws from
# a binomial distribution. Solves the following equation for N:
#
# k_hat = N*p_k - r * np.sqrt(N*p_k*(1-p_k))
#
# where p_k is the probability that a node pairing is a negative edge
# and r is the number of standard deviations to construct the lower bound
#
# Credits to @zjost
p_m = num_edges / num_pairs
p_k = 1 - p_m
a = p_k ** 2
b = -p_k * (2 * k_hat + r ** 2 * p_m)
c = k_hat ** 2
poly = polynomial.Polynomial([c, b, a])
N = poly.roots()[-1]
redundancy = N / k_hat - 1.
return redundancy
[docs]def global_uniform_negative_sampling(
g, num_samples, exclude_self_loops=True, replace=False, etype=None,
redundancy=None):
"""Performs negative sampling, which generate source-destination pairs such that
edges with the given type do not exist.
Specifically, this function takes in an edge type and a number of samples. It
returns two tensors ``src`` and ``dst``, the former in the range of ``[0, num_src)``
and the latter in the range of ``[0, num_dst)``, where ``num_src`` and ``num_dst``
represents the number of nodes with the source and destination node type respectively.
It guarantees that no edge will exist between the corresponding pairs of ``src``
with the source node type and ``dst`` with the destination node type.
.. note::
This negative sampler will try to generate as many negative samples as possible, but
it may rarely return less than :attr:`num_samples` negative samples.
This is more likely to happen when a graph is so small or dense that not many
unique negative samples exist.
Parameters
----------
g : DGLGraph
The graph.
num_samples : int
The number of desired negative samples to generate.
exclude_self_loops : bool, optional
Whether to exclude self-loops from the negative samples. Only impacts the
edge types whose source and destination node types are the same.
Default: True.
replace : bool, optional
Whether to sample with replacement. Setting it to True will make things
faster. (Default: False)
etype : str or tuple of str, optional
The edge type. Can be omitted if the graph only has one edge type.
redundancy : float, optional
Indicates how much more negative samples to actually generate during rejection sampling
before finding the unique pairs.
Increasing it will increase the likelihood of getting :attr:`num_samples` negative
samples, but will also take more time and memory.
(Default: automatically determined by the density of graph)
Returns
-------
tuple[Tensor, Tensor]
The source and destination pairs.
Examples
--------
>>> g = dgl.graph(([0, 1, 2], [1, 2, 3]))
>>> dgl.sampling.global_uniform_negative_sampling(g, 3)
(tensor([0, 1, 3]), tensor([2, 0, 2]))
"""
if etype is None:
etype = g.etypes[0]
utype, _, vtype = g.to_canonical_etype(etype)
exclude_self_loops = exclude_self_loops and (utype == vtype)
redundancy = _calc_redundancy(
num_samples, g.num_edges(etype), g.num_nodes(utype) * g.num_nodes(vtype))
etype_id = g.get_etype_id(etype)
src, dst = _CAPI_DGLGlobalUniformNegativeSampling(
g._graph, etype_id, num_samples, 3, exclude_self_loops, replace, redundancy)
return F.from_dgl_nd(src), F.from_dgl_nd(dst)
DGLHeteroGraph.global_uniform_negative_sampling = utils.alias_func(
global_uniform_negative_sampling)
_init_api('dgl.sampling.negative', __name__)