Module pymake.frontend.frontendnetwork_gt
Source code
import os
import numpy as np
import scipy as sp
import scipy.sparse as sparse
from scipy.sparse import lil_matrix, csr_matrix
from .frontend import DataBase
from .drivers import OnlineDatasetDriver
try:
import graph_tool as gt
from graph_tool import stats, clustering, inference, spectral, topology, generation, search
from graph_tool import draw
except Exception as e:
print('Error while importing graph-tool: %s' % e)
class frontendNetwork_gt(DataBase, OnlineDatasetDriver):
''' A frontend layer for the graph-tool object.
Attributes
----------
data: Graph (from gt)
'''
def __init__(self, expe, data, corpus=None):
super().__init__(expe)
self._data_type = 'network'
corpus = corpus or {}
self.data = data
force_directed = expe.get('directed', corpus.get('directed'))
force_directed = bool(force_directed) if force_directed is not None else None
remove_self_loops = expe.get('remove_self_loops', True)
# Remove selfloop
if remove_self_loops:
# @Warning: the corresponding weights are kept in the map properties
self.log.debug('Self-loops are assumed to be removed from the graph.')
self.remove_self_loops()
# Set the direction and weights
if force_directed is not None:
self.set_directed(force_directed)
else:
# Try to guess directed or not
y = self.adj()
adj_symmetric = (y!=y.T).nnz == 0
adj_symmetric |= sp.sparse.triu(y,1).nnz == 0
adj_symmetric |= sp.sparse.tril(y,-1).nnz == 0
if adj_symmetric:
self.log.info('Guessing undirected graph for: %s' % expe.corpus)
self.set_directed(False)
if y.max() > 1:
self.log.critical('Multiple edges in the graph...')
if 'weights' not in data.ep:
if 'weight' in data.ep:
self.log.critical('Weight property already in the graph, needs a manual check.')
raise NotImplementedError
elif 'value' in data.ep:
weights = data.ep['value'].copy()
#w.a = (2**0.5)**w.a # exponentiate
#weights.a = np.round((weights.a+1)**2) # squared
weights.a = np.round(weights.a *10) # assume ten authors per paper (collaboration networks)
data.ep['weights'] = weights.copy('int')
else:
weights = data.new_ep("int")
weights.a = 1
data.ep['weights'] = weights
else:
weights = data.ep.weights
if self.expe.get('shift_w'):
shift = self.expe.get('shift_w')
if str(shift).startswith('linear'):
_, shift = shift.split('_')
#shift = weights.a.max()
shift = int(shift)
expe['shift_w'] = shift
weights.a = shift*weights.a +shift
else:
shift = int(self.expe['shift_w'])
weights.a += shift
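# Illustrative note (not in the original source): shift_w='linear_2' rescales
# the weights as w -> 2*w + 2, while a plain integer such as shift_w=3 adds a
# constant, w -> w + 3.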
@classmethod
def _extract_data_file(cls, expe, corpus=None):
corpus = corpus or {}
input_path = expe._input_path
if not os.path.exists(input_path):
cls.log.error("Corpus `%s' Not found." % (input_path))
print('please run "fetch_networks"')
cls.data = None
return
if expe.corpus.endswith('.gt'):
corpus_name = expe.corpus[:-len('.gt')]
else:
corpus_name = expe.corpus
# DB integration ?
if corpus_name.startswith(('generator', 'graph')):
ext = 'graph'
basename = os.path.join(input_path, 't0.' + ext)
fun = 'parse_dancer'
elif corpus_name.startswith(('bench1')):
raise NotImplementedError()
elif corpus_name.startswith('facebook'):
ext = 'edges'
basename = os.path.join(input_path, '0.' + ext)
fun = 'parse_edges'
elif corpus_name.startswith(('manufacturing',)):
ext = 'csv'
basename = os.path.join(input_path, corpus_name +'.'+ ext)
fun = 'parse_csv'
elif corpus_name.startswith(('fb_uc', 'emaileu')):
ext = 'txt'
basename = os.path.join(input_path, corpus_name +'.'+ ext)
fun = 'parse_tnet'
elif corpus_name.startswith(('blogs','propro', 'euroroad')):
ext = 'dat'
basename = os.path.join(input_path, corpus_name +'.'+ ext)
fun = 'parse_dat'
else:
raise ValueError('Which corpus to load? %s' % corpus_name)
fn = os.path.join(input_path, basename)
if os.path.isfile(fn) and os.stat(fn).st_size == 0:
cls.log.warning('Doh, Corpus file is empty at: %s' % fn)
return
parse_fun = getattr(cls, fun)
N = corpus.get('nodes')
E = corpus.get('edges')
g = gt.Graph(directed=True)
#g.add_vertex(N)
weights = g.new_edge_property("int") # g.new_ep
labels = g.new_vertex_property("string") # g.new_vp
clusters = g.new_vertex_property("int") # g.new_vp
### Slower, but weights are accumulated correctly
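# parse_fun yields either an edge tuple (i, j, w, f) or a node dict with an
# 'index' key and optional 'label' / 'cluster' keys (see the two branches below).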
for obj in parse_fun(fn):
n_nodes = g.num_vertices()
if isinstance(obj, tuple):
i, j, w, f = obj
if max(i,j) >= n_nodes:
g.add_vertex(max(i,j) - n_nodes + 1)
edge = g.edge(i,j)
if not edge:
# keep a binary network in the graph structure;
# edge multiplicities are stored in the {weights} property.
edge = g.add_edge(i, j)
weights[edge] = w + weights[edge]
else:
v = obj.pop('index')
if v >= n_nodes:
g.add_vertex(v - n_nodes + 1)
if 'label' in obj:
labels[v] = obj['label']
if 'cluster' in obj:
clusters[v] = obj['cluster']
### Faster, but weights are wrong
#_edges = []
#_index = []
#_label = []
#for obj in parse_fun(fn):
# if isinstance(obj, tuple):
# i, j, w, f = obj
# _edges.append([i,j,w])
# else:
# _index.append(obj['index'])
# _label.append(obj['label'])
#g.add_edge_list(_edges, eprops=[weights])
#if _label:
# labels.a[_index] = _label
g.edge_properties['weights'] = weights # g.ep
g.vertex_properties['labels'] = labels # g.vp
g.vertex_properties['clusters'] = clusters
# If there are no labels, remove that property
n_label = 0
for v in range(g.num_vertices()):
if g.vp['labels'][v]:
n_label += 1
break
if n_label == 0:
del g.vp['labels']
# If all clusters are zeros, consider no label information
if g.vp['clusters'].a.sum() == 0:
del g.vp['clusters']
# Remove the first vertex in case indexing starts at 1
zero_degree = g.vertex(0).out_degree() + g.vertex(0).in_degree()
if zero_degree == 0:
cls.log.debug('shifting the graph. (-1 to vertex index)')
g.remove_vertex(0)
_N = g.num_vertices()
_E = g.num_edges()
if N and N != _N:
cls.log.warning('Number of nodes differs, doc: %d, code: %d' % (N, _N))
if E and E != _E:
cls.log.warning('Number of edges differs, doc: %d, code: %d' % (E, _E))
return g
@classmethod
def _clean_data_konect(cls, expe, data):
cls.log.info('Cleaning konect graph %s...' % expe.corpus)
g = data
edges = g.get_edges()
y = gt.spectral.adjacency(g)
parallel = gt.stats.label_parallel_edges(g, mark_only=True)
weights = g.new_ep('int')
if 'weight' in g.ep:
w = g.ep.weight
if w.a.sum() == len(w.a):
# Assume parallel edges
# Build the real weights
weights.a = 1
for _ix in np.where(parallel.a == 0)[0]:
ix = np.where(edges[:,2]==_ix)[0][0]
i, j = edges[ix, :2]
weights[i,j] += len(g.edge(i,j, all_edges=True))
del g.ep['weight']
elif w.a.max() == 1 and w.a.min() == -1:
for i,j, ix in edges:
weights[i,j] += w[i,j]
if weights.a.min() < 0:
# Neutralize negative edges (set their weight to 1)
cls.log.warning('Negative weights found; neutralizing them...')
print('Number of negative edges: %d' % (weights.a < 0).sum() )
#print('Number of negative edges: %d' % (weights.a < 0).sum() )
#edge_f = g.new_ep('bool')
#edge_f.a = 1
#edge_f.a[weights.a < 0] = 0
#g.set_edge_filter(edge_f)
#g.purge_edges()
#g.clear_filters()
weights.a[weights.a < 0] = 1
if (weights.a==0).sum() > 0:
# Neutralize zero-weight edges (set their weight to 1)
cls.log.warning('Zero-weight edges found; setting them to 1...')
print('Number of zeros edges: %d' % (weights.a == 0).sum() )
#edge_f = g.new_ep('bool')
#edge_f.a = 1
#edge_f.a[weights.a == 0] = 0
#g.set_edge_filter(edge_f)
#g.purge_edges()
#g.clear_filters()
weights.a[weights.a == 0] = 1
elif y.max() == 1:
# Assume the weights are the real ones; copy them
weights = g.ep.weight.copy('int')
cls.log.warning('Weights converted to int values.')
elif y.max() > 1:
raise NotImplementedError('Multi edge + weight property for %s, manual check needed' % expe.corpus)
else:
raise NotImplementedError('Edge property unknown for %s, manual check needed' % expe.corpus)
elif 'weights' in g.ep:
cls.log.critical('Weights property already present for %s, manual check needed' % expe.corpus)
raise NotImplementedError
elif y.max() > 1:
# If multiple edges are detected, set the weights.
weights.a = 1
for _ix in np.where(parallel.a == 0)[0]:
ix = np.where(edges[:,2]==_ix)[0][0]
i, j = edges[ix][:2]
weights[i,j] += len(g.edge(i,j, all_edges=True))
else:
# Assume unweighted network
weights.a = 1
# Remove parallel edges
#gt.stats.remove_parallel_edges(g) # doesn't resize .a!
g.set_edge_filter(parallel, inverted=True)
g.purge_edges()
g.clear_filters()
g.shrink_to_fit()
g.ep['weights'] = weights
return g
@classmethod
def _resolve_filename(cls, expe):
input_path = expe._input_path
if expe.corpus.endswith('.gt'):
basename = expe.corpus
else:
basename = expe.corpus + '.gt'
fn = os.path.join(input_path, basename)
return fn
@classmethod
def _save_data(cls, fn, data):
driver = data.save
return super()._save_data(fn, None, driver=driver)
@classmethod
def _load_data(cls, fn):
driver = gt.load_graph
return super()._load_data(fn, driver=driver)
@classmethod
def from_expe(cls, expe, corpus=None, load=True, save=True):
if '_force_load_data' in expe:
load = expe._force_load_data
if '_force_save_data' in expe:
save = expe._force_save_data
input_path = cls.get_input_path(expe)
data = None
fn = cls._resolve_filename(expe)
target_file_exists = os.path.exists(fn)
if load is False or not target_file_exists:
# Data loading Strategy
if not data:
try:
# Load from the local graph-tool collection
from graph_tool import collection
data = gt.load_graph(collection.get_data_path(expe.corpus))
os.makedirs(os.path.join(input_path), exist_ok=True)
except FileNotFoundError as e:
pass
except Exception as e:
cls.log.error("Error in loading corpus `%s': %s" % (expe.corpus, e))
raise e
if not data:
try:
from urllib.error import HTTPError
# Load from the graph-tool KONECT collection (remote)
data = gt.collection.konect_data[expe.corpus]
data = cls._clean_data_konect(expe, data)
os.makedirs(os.path.join(input_path), exist_ok=True)
except HTTPError:
pass
except Exception as e:
cls.log.error("Error in loading corpus `%s': %s" % (expe.corpus, e))
raise e
if not data:
# Load manually from file
data = cls._extract_data_file(expe, corpus=corpus)
if save:
# ===== save ====
cls._save_data(fn, data)
else:
# ===== load ====
data = cls._load_data(fn)
return cls(expe, data, corpus=corpus)
#
# Get Properties
#
def edge(self, i,j):
return self.data.edge(i,j)
def weight(self, i,j):
w = self.data.ep['weights']
if self.edge(i,j):
return w[i,j]
else:
return 0
def label(self, v):
l = self.data.vp['labels']
return l[v]
def cluster(self, v):
c = self.data.vp['clusters']
return c[v]
def num_neighbors(self):
neigs = []
for v in range(self.num_nodes()):
_v = self.data.vertex(v)
neigs.append(len(list(_v.all_neighbors())))
return np.asarray(neigs)
def is_directed(self):
return self.data.is_directed()
def is_symmetric(self):
return not self.data.is_directed()
def getN(self):
return self.num_nodes()
def get_validset_ratio(self):
if 'validset_ratio' in self.expe:
validset_ratio = self.expe['validset_ratio']
validset_ratio = float(validset_ratio) / 100
else:
validset_ratio = 0
return validset_ratio
def get_training_ratio(self):
if 'training_ratio' in self.expe:
training_ratio = self.expe['training_ratio']
training_ratio = float(training_ratio) / 100
else:
training_ratio = 0
return training_ratio
def get_testset_ratio(self):
if 'testset_ratio' in self.expe:
testset_ratio = self.expe['testset_ratio']
testset_ratio = float(testset_ratio) / 100
else:
testset_ratio = 0
validset_ratio = self.get_validset_ratio()
# The validation ratio is taken on the remaining data (eta_ratio...)
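# Example (illustrative): testset_ratio=20 and validset_ratio=10 give
# 0.20 * (1 + 0.10) = 0.22, i.e. 22% of the edges end up in the testset.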
testset_ratio = testset_ratio * (1 + validset_ratio)
return testset_ratio
def num_nodes(self):
return self.data.num_vertices()
def num_edges(self):
return self.data.num_edges()
def num_nnz(self):
N = self.data.num_vertices()
sym_pt = 1 if self.is_directed() else 2
if hasattr(self, 'data_test'):
T = N*(N-1)/sym_pt - int(self.data_test.sum()/sym_pt)
else:
T = N*(N-1)/sym_pt
return T
def num_nnzsum(self):
return self.data.ep['weights'].a.sum()
def num_mnb(self):
''' Minibatch size for the sampling. '''
return int(self.num_nodes() * self._zeros_set_len * float(self.expe['sampling_coverage']))
def diameter(self):
diameter, end_points = gt.topology.pseudo_diameter(self.data)
return int(diameter)
def density(self):
e = self.data.num_edges()
v = self.data.num_vertices()
t = v * (v-1)
if not self.data.is_directed():
t /= 2
return e / t
def modularity(self):
if not 'labels' in self.data.vp:
return None
labels = self.data.vp['labels']
if 'weights' in self.data.ep:
weights = self.data.ep['weights']
else:
weights = None
mod = gt.inference.modularity(self.data, labels, weights)
return mod
def clustering_coefficient(self):
clust_coef, std = gt.clustering.global_clustering(self.data)
return clust_coef
def net_type(self):
weights = self.data.ep['weights'].a
return 'm/std/mn/mx: %.1f %.1f %d %d' % (weights.mean(),
weights.std(),
weights.min(),
weights.max())
def feat_len(self):
weights = self.data.ep['weights'].a
return len(np.unique(weights))
@staticmethod
def _prop(g):
n, e, y = g.num_vertices(), g.num_edges(), gt.spectral.adjacency(g)
sum = y.sum()
#sl = np.diag(y.A).sum()
sl = y[(np.arange(n),np.arange(n))].sum()
print(g)
print('N: %d, E: %d, adj.sum: %d, adj.selfloop: %d' % (n,e,sum,sl))
print('edges shape', g.get_edges().shape)
print('Vertex prop', g.vp)
print('Edge prop', g.ep)
#
# Transform
#
def reverse_filter(self):
filter, is_inv = self.data.get_edge_filter()
self.data.set_edge_filter(filter, inverted=not is_inv)
def adj(self):
''' Returns a sparse adjacency matrix.
Notes
-----
graph-tool's native adjacency indices are reversed from the numpy convention
(i: row, j: column), hence the transpose below.
'''
return gt.spectral.adjacency(self.data).T
def laplacian(self):
''' Returns a sparse Laplacian matrix. '''
return gt.spectral.laplacian(self.data)
def binarize(self):
# Or set a property with gt.stats.label_parallel_edges(g)
return gt.stats.remove_parallel_edges(self.data)
def set_directed(self, directed=True):
# Modifying the adjacency matrix does not modify the gt graph
#y = self.adj()
#(y!=y.T).nnz == 0
self.data.set_directed(directed)
def remove_self_loops(self):
# Or mark them with a property via gt.stats.label_self_loops(g)
return gt.stats.remove_self_loops(self.data)
def sample(self, N):
''' Randomly reduce the number of nodes of the graph. '''
n_to_remove = self.data.num_vertices() - N
ids = np.random.choice(self.data.num_vertices(), n_to_remove, replace=False)
self.data.remove_vertex(ids)
self.data.shrink_to_fit()
#
# Test/Validation set
#
def make_testset(self, diag_off=1, set_filter=True):
''' Build the testset as an edge property of the graph.
If {set_filter} is True, activate the filtering property on the graph.
The algorithm tries to build an equilibrated testset between edges and non-edges,
with some uncertainty for undirected graphs.
Notes
-----
For undirected graphs, the sparse matrix is symmetrized, so edge values can be
cumulated from the two mirrored samples. Thus the non-edge ratio is reduced to
1/3 of the edge ratio to be fair.
This operation can be sub-optimal since it compares arrays of size N**2.
Warning: if set_filter=False, the weight is not set in the testset matrix.
'''
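# Illustrative example (not in the original source): with E=1000 edges,
# testset_ratio=20 (percent) and zeros_ratio=1, n=200 links and about 200
# non-links are marked in the testset below.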
testset_ratio = self.get_testset_ratio()
if not testset_ratio:
raise ValueError('Testset ratio not understood : %s' % testset_ratio)
g = self.data
y = self.adj()
N = g.num_vertices()
E = g.num_edges()
is_directed = not self.is_symmetric()
symmetric_scale = 1 if is_directed else 2
size = N**2
#validset = lil_matrix((N,N))
testset = lil_matrix((N,N), dtype=bool)
n = int(E * testset_ratio) # number of edges
nz = int(n *float(self.expe.get('zeros_ratio', 1))) # number of non-links
edges = g.get_edges().astype(int)
i, j = edges[:, 0:2].T
# Sampling edges
ix = np.random.choice(E, n, replace=False)
ix = np.array((i[ix], j[ix]))
#testset[ix[0], ix[1]] = 1
# Sampling non-edges
#
# Ignore the diagonal => include the diagonal indices
# in (i,j) as if they were selected in the testset.
# If undirected, also ignore the lower part of the adjacency matrix.
if is_directed:
i = np.hstack((i, np.arange(N)))
j = np.hstack((j, np.arange(N)))
else:
_i = i
i = np.hstack((i, j, np.arange(N)))
j = np.hstack((j, _i, np.arange(N)))
edge_set = set(np.ravel_multi_index((i,j), y.shape))
nonlinks_set = set()
max_loop = 150
cpt = 0
# Account for the probability of sampling symmetric (mirrored) entries...
zeros_len = int(nz + int(not is_directed)*nz/size)
while len(nonlinks_set) < zeros_len and cpt < max_loop:
nnz = zeros_len - len(nonlinks_set)
nonlinks_set.update(set(np.random.choice(size, nnz)))
nonlinks_set -= edge_set
cpt+=1
self.log.debug('Number of iteration for sampling non-edges: %d' % cpt)
if cpt >= max_loop:
self.log.warning('Sampling testset non-links failed:')
print('desired size: %d, obtained: %d' % (zeros_len, len(nonlinks_set)))
jx = np.array(np.unravel_index(list(nonlinks_set), (N,N)))
#k = np.ravel_multi_index((i,j), y.shape)
#jx = np.random.choice(size, int(nz/symmetric_scale), replace=False)
#kind = np.bincount(k, minlength=size).astype(bool)
#jind = np.bincount(jx, minlength=size).astype(bool)
## xor on bool faster ?!
#_n = (jind & (kind==0)).sum()
#max_loop = 100
#cpt = 0
#while _n < int(nz/symmetric_scale) or cpt > max_loop:
# jx = np.random.choice(size, int(nz/symmetric_scale) - _n, replace=False)
# jind = jind|np.bincount(jx, minlength=size).astype(bool)
# _n = (jind & (kind==0)).sum()
# cpt += 1
# #print(_n, n)
#self.log.debug('Number of iteration for sampling non-edges: %d'%cpt)
#jx = np.arange(size)[jind & (kind==0)]
#jx = np.array(np.unravel_index(jx, (N,N)))
# Actually set the testset entries
testset[ix[0], ix[1]] = 1
testset[jx[0], jx[1]] = 1
if not is_directed:
testset[ix[1], ix[0]] = 1
testset[jx[1], jx[0]] = 1
data_test_w = np.transpose(testset.nonzero())
#self.reverse_filter()
weights = []
for i,j in data_test_w:
weights.append(self.weight(i,j))
#self.reverse_filter()
data_test_w = np.hstack((data_test_w, np.array(weights)[None].T))
if set_filter:
# Also set the weight here...
testset_filter = g.new_ep('bool')
testset_filter.a = 1
self.log.info('Looping to set the testset edge properties (graph-tool)...')
for i, j in ix.T:
testset_filter[i,j] = 0
g.set_edge_filter(testset_filter)
self._links_testset = ix
self._nonlinks_testset = jx
self.data_test = testset
self.data_test_w = data_test_w
# If we learn with a training_ratio < 1,
# we just ignore some random points.
training_ratio = self.get_training_ratio()
if training_ratio > 0 and training_ratio < 1:
# remove edges
self.data.set_fast_edge_removal()
edges = self.data.get_edges()
weights = self.data.ep['weights']
n_extra = len(edges) - training_ratio*len(edges)
edges_to_remove = np.random.choice(len(edges), int(n_extra), replace=False)
for pos in edges_to_remove:
i,j = edges[pos, :2]
e = g.edge(i,j)
g.remove_edge(e)
self.data.set_fast_edge_removal(fast=False)
return
def _check(self):
''' Warning: will clear the filter! '''
expe = self.expe
g = self.data
self._prop(g)
y = self.adj()
N = g.num_vertices()
E = g.num_edges()
# check no self-loop
assert(y.diagonal().sum() == 0)
# check directed/undirected is consistent
is_directed = not self.is_symmetric()
adj_symmetric = (y!=y.T).nnz == 0
assert(adj_symmetric != is_directed)
# check is not bipartite
assert(not gt.topology.is_bipartite(g))
if 'testset_ratio' in expe:
testset_ratio = self.get_testset_ratio()
filter = g.get_edge_filter()
g.clear_filters()
y = self.adj()
n = int(g.num_edges() * testset_ratio)
nz = int(n *float(self.expe.get('zeros_ratio', 1)))
ix = self._links_testset
jx = self._nonlinks_testset
# check links testset
assert(np.all(y[ix[0], ix[1]] == 1))
# check non-links testset
assert(np.all(y[jx[0], jx[1]] == 0))
print('Size of testset: expected: %d, obtained: %d' % (n+nz, self.data_test.sum()))
print('number of links: %d' % (len(ix[0])))
print('number of non-links: %d' % (len(jx[0])))
if filter[0]:
g.set_edge_filter(filter[0], inverted=filter[1])
return
def make_noise(self):
expe = self.expe
g = self.data
is_directed = g.is_directed()
symmetric_scale = 1 if is_directed else 2
y = self.adj()
E = g.num_edges()
N = g.num_vertices()
T = N*(N-1)/symmetric_scale
weights = g.ep['weights']
ratio = 1/float(expe['noise'])
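# Illustrative: expe['noise']=10 gives ratio=0.1, so about 10% of the edges are
# decremented or removed below, and about 2*ratio*E = 20% of E new unit-weight
# edges are added back at random.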
#ts = self.get_testset_ratio()
#if ts:
# ratio -= 2*ts*E/T
# if ratio < 0:
# raise ValueError('Negative prior for noise ration')
g.set_fast_edge_removal()
edges = g.get_edges()
edges_to_remove = np.random.choice(E, int(ratio*E), replace=False)
_removed = set()
for pos in edges_to_remove:
i,j = edges[pos, :2]
w = weights[i,j]
if w > 1:
weights[i,j] = w-1
else:
e = g.edge(i,j)
_removed.add(e)
g.remove_edge(e)
num_to_add = ratio*E*2
cpt = 0
while cpt < num_to_add:
i, j = np.random.randint(0,N, 2)
if g.edge(i,j):
continue
else:
e = g.add_edge(i,j)
if e in _removed:
continue
weights[e] = 1
cpt+=1
return g.set_fast_edge_removal(fast=False)
def __iter__(self):
# Get the sampling strategy:
chunk = self.expe.get('chunk', 'stratify')
if chunk == 'stratify':
return self._sample_stratify()
elif chunk == 'sparse':
return self._sample_sparse()
else:
raise NotImplementedError('Unknown sampling strategy: %s' % chunk)
def _sample_stratify(self):
''' Sample with node replacement,
but edges within a minibatch are unique.
Returns
-------
Yields triplets (source, target, weight) of two kinds:
1. at a new minibatch (mnb):
* source: str -- a string identifying which subset the mnb comes from.
* target: dict -- information on the pivot node of the mnb (vertex, direction).
* weight: float -- a scaling factor (the inverse probability of sampling an
element of the current set/mnb in the corpus/dataset).
2. an observation (edge):
* source: int -- source node.
* target: int -- target node.
* weight: int -- edge weight.
'''
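# Illustrative yields (not in the original source): a minibatch marker such as
# ('[1]', {'vertex': 4, 'direction': 0}, scale) followed by observations such
# as (4, 7, weights[4, 7]) for each neighbor of the pivot node.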
expe = self.expe
g = self.data
is_directed = g.is_directed()
symmetric_scale = 1 if is_directed else 2 # number of sets containing each edge
y = self.adj()
E = g.num_edges()
N = g.num_vertices()
T = N*(N-1)/symmetric_scale
weights = g.ep['weights']
### Chunk prior probability
set_len = 2 + is_directed
# uniform between links and non-links
zero_prior = float(expe.get('zeros_set_prob', 0.5))
set_prior = [zero_prior] + [(1-zero_prior)/(set_len-1)]*(set_len-1)
### Sampling prior strategy
zeros_set_len = expe.get('zeros_set_len', 10)
self._zeros_set_len = int(zeros_set_len)
mask = np.ones((N,2), dtype=int) * self._zeros_set_len
### Build the neighbourhood array for each node
self.log.debug('Building the neighbourhood array...')
neigs = []
mask_pos = []
for v in range(N):
_out = np.array([int(_v) for _v in g.vertex(v).out_neighbors()])
_in = np.array([int(_v) for _v in g.vertex(v).in_neighbors()])
neigs.append([_out, _in])
mask_pos.append( [list(range(self._zeros_set_len))]*2 )
for _mnb in range(self.num_mnb()):
# Pick a node
node = np.random.randint(0,N)
# Pick a set and yield a minibatch
set_index = np.random.choice(set_len, 1, p=set_prior)
if set_index == 0:
out_e = neigs[node][0]
if N-len(out_e) > 0 and len(out_e) > 0:
node_info = {'vertex':node, 'direction':0}
yield str(set_index), node_info, 1/symmetric_scale*set_len*mask[node, 0]
# Sample from non-links
zero_samples = np.arange(N)
zero_samples[out_e] = -1 # don't sample existing edges
zero_samples[node] = -1 # don't sample self-loops
#zero_samples[self.data_test[node, :].nonzero()[1]] = -1 # don't sample in testset
zero_samples = zero_samples[zero_samples >0]
zero_samples = np.random.choice(zero_samples,
int(np.ceil(len(zero_samples)/mask[node, 0])),
replace=False)
# Weaker results !
#step = int(np.ceil(len(zero_samples)/mask[node, 0]))
#if len(mask_pos[node][0]) == 0:
# continue
# zero_samples = np.random.choice(zero_samples,
# int(np.ceil(len(zero_samples)/mask[node, 0])),
# replace=False)
#else:
# _p = np.random.choice(mask_pos[node][0],1).item()
# mask_pos[node][0].remove(_p)
# zero_samples = zero_samples[_p:_p+step]
if len(zero_samples) == 0:
continue
for target in zero_samples:
yield node, target, 0
in_e = neigs[node][1]
if N-len(in_e) > 0 and len(in_e) > 0:
node_info = {'vertex':node, 'direction':1}
yield str(set_index), node_info, 1/symmetric_scale*set_len*mask[node, 1]
zero_samples = np.arange(N)
if len(in_e) > 0:
zero_samples[in_e] = -1
zero_samples[node] = -1
#zero_samples[self.data_test[:, node].nonzero()[0]] = -1
zero_samples = zero_samples[zero_samples >0]
zero_samples = np.random.choice(zero_samples,
int(np.ceil(len(zero_samples)/mask[node, 1])),
replace=False)
# Weaker results !
#step = int(np.ceil(len(zero_samples)/mask[node, 1]))
#if len(mask_pos[node][1]) == 0:
# continue
# zero_samples = np.random.choice(zero_samples,
# int(np.ceil(len(zero_samples)/mask[node, 1])),
# replace=False)
#else:
# _p = np.random.choice(mask_pos[node][1],1).item()
# mask_pos[node][1].remove(_p)
# zero_samples = zero_samples[_p:_p+step]
if len(zero_samples) == 0:
continue
for target in zero_samples:
yield target, node, 0
elif set_index == 1:
# Sample from links
if len(neigs[node][0]) == 0:
continue
node_info = {'vertex':node, 'direction':0}
yield str(set_index), node_info, 1/symmetric_scale*N*set_len
for target in neigs[node][0]:
yield node, target, weights[node, target]
elif set_index == 2:
# Sample from links
if len(neigs[node][1]) == 0:
continue
node_info = {'vertex':node, 'direction':1}
yield str(set_index), node_info, 1/symmetric_scale*N*set_len
for target in neigs[node][1]:
yield target, node, weights[target, node]
else:
raise ValueError('Set index error: %s' % set_index)
def _sample_sparse(self):
''' Sample with node replacement.
'''
expe = self.expe
g = self.data
is_directed = g.is_directed()
symmetric_scale = 1 if is_directed else 2 # number of sets containing each edge
y = self.adj()
E = g.num_edges()
N = g.num_vertices()
T = N*(N-1)/symmetric_scale
weights = g.ep['weights']
### Chunk prior probability
set_len = 2 + is_directed
# uniform between links and non-links
zero_prior = float(expe.get('zeros_set_prob', 0.5))
set_prior = [zero_prior] + [(1-zero_prior)/(set_len-1)]*(set_len-1)
### Sampling prior strategy
zeros_set_len = expe.get('zeros_set_len', 10)
self._zeros_set_len = int(zeros_set_len)
mask = np.ones((N,2), dtype=int) * self._zeros_set_len
### Build the neighbourhood array for each node
self.log.debug('Building the neighbourhood array...')
neigs = []
mask_pos = []
for v in range(N):
_out = np.array([int(_v) for _v in g.vertex(v).out_neighbors()])
_in = np.array([int(_v) for _v in g.vertex(v).in_neighbors()])
neigs.append([_out, _in])
mask_pos.append( [list(range(self._zeros_set_len))]*2 )
for _mnb in range(self.num_mnb()):
# Pick a node
node = np.random.randint(0,N)
# Pick a set and yield a minibatch
set_index = np.random.choice(set_len, 1, p=set_prior)
if set_index == 0:
out_e = neigs[node][0]
if N-len(out_e) > 0 and len(out_e) > 0:
node_info = {'vertex':node, 'direction':0}
yield str(set_index), node_info, (N-len(out_e)-1)* mask[node,0]* symmetric_scale
# Sample from non-links
zero_samples = np.arange(N)
zero_samples[out_e] = -1 # don't sample existing edges
zero_samples[node] = -1 # don't sample self-loops
#zero_samples[self.data_test[node, :].nonzero()[1]] = -1 # don't sample in testset
zero_samples = zero_samples[zero_samples >0]
zero_samples = np.random.choice(zero_samples,
int(np.ceil(len(zero_samples)/mask[node, 0])),
replace=False)
# Weaker results !
#step = int(np.ceil(len(zero_samples)/mask[node, 0]))
#if len(mask_pos[node][0]) == 0:
# zero_samples = np.random.choice(zero_samples,
# int(np.ceil(len(zero_samples)/mask[node, 0])),
# replace=False)
#else:
# _p = np.random.choice(mask_pos[node][0],1).item()
# mask_pos[node][0].remove(_p)
# zero_samples = zero_samples[_p:_p+step]
if len(zero_samples) == 0:
continue
for target in zero_samples:
yield node, target, 0
in_e = neigs[node][1]
if N-len(in_e) > 0 and len(in_e) > 0:
node_info = {'vertex':node, 'direction':1}
yield str(set_index), node_info, (N-len(in_e)-1)* mask[node,1]* symmetric_scale
zero_samples = np.arange(N)
if len(in_e) > 0:
zero_samples[in_e] = -1
zero_samples[node] = -1
#zero_samples[self.data_test[:, node].nonzero()[0]] = -1
zero_samples = zero_samples[zero_samples >0]
zero_samples = np.random.choice(zero_samples,
#int(np.ceil(len(zero_samples)/(10*len(in_e)))),
int(np.ceil(len(zero_samples)/mask[node, 1])),
replace=False)
# Weaker results !
#step = int(np.ceil(len(zero_samples)/mask[node, 1]))
#if len(mask_pos[node][1]) == 0:
# zero_samples = np.random.choice(zero_samples,
# int(np.ceil(len(zero_samples)/mask[node, 1])),
# replace=False)
#else:
# _p = np.random.choice(mask_pos[node][1],1).item()
# mask_pos[node][1].remove(_p)
# zero_samples = zero_samples[_p:_p+step]
if len(zero_samples) == 0:
continue
for target in zero_samples:
yield target, node, 0
elif set_index == 1:
# Sample from links
if len(neigs[node][0]) == 0:
continue
node_info = {'vertex':node, 'direction':0}
yield str(set_index), node_info, symmetric_scale*N*len(neigs[node][0])
for target in neigs[node][0]:
yield node, target, weights[node, target]
elif set_index == 2:
# Sample from links
if len(neigs[node][1]) == 0:
continue
node_info = {'vertex':node, 'direction':1}
yield str(set_index), node_info, symmetric_scale*N*len(neigs[node][1])
for target in neigs[node][1]:
yield target, node, weights[target, node]
else:
raise ValueError('Set index error: %s' % set_index)
Classes
class frontendNetwork_gt (expe, data, corpus=None)
-
A frontend layer for the graph-tool object.
Attributes
data : Graph (from gt)
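A minimal usage sketch (not part of the source; it assumes `expe` is a pymake experiment object exposing the corpus name and the options read in __init__, such as directed, testset_ratio and chunk):
from pymake.frontend.frontendnetwork_gt import frontendNetwork_gt
# `expe` is assumed to come from pymake's experiment machinery (hypothetical here).
frontend = frontendNetwork_gt.from_expe(expe, load=True, save=True)
print(frontend.num_nodes(), frontend.num_edges(), frontend.density())
frontend.make_testset()   # requires expe['testset_ratio'] (in percent)
# Minibatch iteration; see __iter__ and _sample_stratify for the yield format.
for source, target, weight in frontend:
    pass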
Ancestors
Static methods
def from_expe(expe, corpus=None, load=True, save=True)
-
Source code
@classmethod
def from_expe(cls, expe, corpus=None, load=True, save=True):
    if '_force_load_data' in expe:
        load = expe._force_load_data
    if '_force_save_data' in expe:
        save = expe._force_save_data

    input_path = cls.get_input_path(expe)
    data = None
    fn = cls._resolve_filename(expe)
    target_file_exists = os.path.exists(fn)

    if load is False or not target_file_exists:
        # Data loading Strategy
        if not data:
            try:
                # Load from graph-tool Konnect repo
                from graph_tool import collection
                data = gt.load_graph(collection.get_data_path(expe.corpus))
                os.makedirs(os.path.join(input_path), exist_ok=True)
            except FileNotFoundError as e:
                pass
            except Exception as e:
                cls.log.error("Error in loading corpus `%s': %s" % (expe.corpus, e))
                raise e

        if not data:
            try:
                from urllib.error import HTTPError
                # Load from graph-tool Konnect site
                data = gt.collection.konect_data[expe.corpus]
                data = cls._clean_data_konect(expe, data)
                os.makedirs(os.path.join(input_path), exist_ok=True)
            except HTTPError:
                pass
            except Exception as e:
                cls.log.error("Error in loading corpus `%s': %s" % (expe.corpus, e))
                raise e

        if not data:
            # Load manually from file
            data = cls._extract_data_file(expe, corpus=corpus)

        if save:
            # ===== save ====
            cls._save_data(fn, data)
    else:
        # ===== load ====
        data = cls._load_data(fn)

    return cls(expe, data, corpus=corpus)
Methods
def adj(self)
-
Returns a sparse Adjacency matrix.
Notes
The indices of the adjacency matrix are reversed from the NumPy convention (i: row, j: column).
Source code
def adj(self):
    ''' Returns a sparse Adjacency matrix.

        Notes
        -----
        The indices of the adjacency matrix are reversed from the NumPy
        convention (i: row, j: column).
    '''
    return gt.spectral.adjacency(self.data).T
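The note above concerns index orientation. As an illustration only, independent of the frontend and assuming the row index is read as the source node after the transpose, row and column sums of such a sparse matrix give out- and in-degrees:

import numpy as np
import scipy.sparse as sparse

# Toy matrix: A[i, j] != 0 read as an edge i -> j under that assumption.
A = sparse.csr_matrix(np.array([[0, 1, 1],
                                [0, 0, 1],
                                [0, 0, 0]]))
out_degrees = np.asarray(A.sum(axis=1)).ravel()   # [2, 1, 0]
in_degrees = np.asarray(A.sum(axis=0)).ravel()    # [0, 1, 2]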
def binarize(self)
-
Source code
def binarize(self):
    # Or set a property with gt.stats.label_parallel_edges(g)
    return gt.stats.remove_parallel_edges(self.data)
def cluster(self, v)
-
Source code
def cluster(self, v):
    c = self.data.vp['clusters']
    return c[v]
def clustering_coefficient(self)
-
Source code
def clustering_coefficient(self):
    clust_coef, std = gt.clustering.global_clustering(self.data)
    return clust_coef
def density(self)
-
Source code
def density(self):
    e = self.data.num_edges()
    v = self.data.num_vertices()
    t = v * (v-1)
    if not self.data.is_directed():
        t /= 2
    return e / t
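A worked example of the formula above (illustrative only): an undirected graph with 5 nodes and 4 edges has 5*4/2 = 10 possible edges, hence density 0.4.

v, e = 5, 4
t = v * (v - 1) / 2          # undirected: number of unordered pairs
print(e / t)                 # 0.4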
def diameter(self)
-
Source code
def diameter(self):
    diameter, end_points = gt.topology.pseudo_diameter(self.data)
    return int(diameter)
def edge(self, i, j)
-
Source code
def edge(self, i, j):
    return self.data.edge(i, j)
def feat_len(self)
-
Source code
def feat_len(self):
    weights = self.data.ep['weights'].a
    return len(np.unique(weights))
def getN(self)
-
Source code
def getN(self):
    return self.num_nodes()
def get_testset_ratio(self)
-
Source code
def get_testset_ratio(self):
    if 'testset_ratio' in self.expe:
        testset_ratio = self.expe['testset_ratio']
        testset_ratio = float(testset_ratio) / 100
    else:
        testset_ratio = 0

    validset_ratio = self.get_validset_ratio()
    # Validation ratio is a ratio taken on the remaining data (eta_ratio...)
    testset_ratio = testset_ratio * (1 + validset_ratio)
    return testset_ratio
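A worked example of the computation above, assuming both ratios are given in percent in the expe: with testset_ratio=20 and validset_ratio=10, the effective test ratio is 0.20 * (1 + 0.10) = 0.22, i.e. the validation share is taken on top of the test share.

testset_ratio = 20 / 100
validset_ratio = 10 / 100
print(testset_ratio * (1 + validset_ratio))   # 0.22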
def get_training_ratio(self)
-
Source code
def get_training_ratio(self):
    if 'training_ratio' in self.expe:
        training_ratio = self.expe['training_ratio']
        training_ratio = float(training_ratio) / 100
    else:
        training_ratio = 0
    return training_ratio
def get_validset_ratio(self)
-
Source code
def get_validset_ratio(self):
    if 'validset_ratio' in self.expe:
        validset_ratio = self.expe['validset_ratio']
        validset_ratio = float(validset_ratio) / 100
    else:
        validset_ratio = 0
    return validset_ratio
def is_directed(self)
-
Source code
def is_directed(self):
    return self.data.is_directed()
def is_symmetric(self)
-
Source code
def is_symmetric(self):
    return not self.data.is_directed()
def label(self, v)
-
Source code
def label(self, v):
    l = self.data.vp['labels']
    return l[v]
def laplacian(self)
-
Returns a sparse Laplacian matrix.
Source code
def laplacian(self):
    ''' Returns a sparse Laplacian matrix. '''
    return gt.spectral.laplacian(self.data)
def make_noise(self)
-
Source code
def make_noise(self):
    expe = self.expe
    g = self.data

    is_directed = g.is_directed()
    symmetric_scale = 1 if is_directed else 2
    y = self.adj()
    E = g.num_edges()
    N = g.num_vertices()
    T = N*(N-1)/symmetric_scale
    weights = g.ep['weights']

    ratio = 1/float(expe['noise'])
    #ts = self.get_testset_ratio()
    #if ts:
    #    ratio -= 2*ts*E/T
    #    if ratio < 0:
    #        raise ValueError('Negative prior for noise ration')

    g.set_fast_edge_removal()

    edges = g.get_edges()
    edges_to_remove = np.random.choice(E, int(ratio*E), replace=False)
    _removed = set()
    for pos in edges_to_remove:
        i,j = edges[pos, :2]
        w = weights[i,j]
        if w > 1:
            weights[i,j] = w-1
        else:
            e = g.edge(i,j)
            _removed.add(e)
            g.remove_edge(e)

    num_to_add = ratio*E*2
    cpt = 0
    while cpt < num_to_add:
        i, j = np.random.randint(0,N, 2)
        if g.edge(i,j):
            continue
        else:
            e = g.add_edge(i,j)
            if e in _removed:
                continue
            weights[e] = 1
            cpt+=1

    return g.set_fast_edge_removal(fast=False)
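A worked example of the noise schedule above (illustrative only): with expe['noise'] = 10 and E = 200 edges, ratio = 1/10, so int(0.1*200) = 20 existing edges are perturbed (their weight decremented, or the edge removed when its weight is 1) and roughly 2*0.1*200 = 40 new unit-weight edges are added at random positions.

E, noise = 200, 10
ratio = 1 / noise
print(int(ratio * E), int(2 * ratio * E))   # 20 40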
def make_testset(self, diag_off=1, set_filter=True)
-
Make the testset as an edge property of the graph. If {set_filter} is True, activate the filtering property on the graph.
The algorithm tries to build a balanced testset between edges and non-edges, with some uncertainty for undirected graphs.
Notes
For undirected graphs, the sparse matrix is symmetrized, so the value of an edge can be cumulated from the two inverted samples. Thus the ratio for the non-edges is reduced to 1/3 of the edge ratio to be fair.
This operation can be sub-optimal since it compares arrays of size N**2.
Warning: if set_filter=False, weights are not set in the testset matrix.
Source code
def make_testset(self, diag_off=1, set_filter=True):
    ''' Make the testset as an edge property of the graph.
        If {set_filter} is True, activate the filtering property on the graph.

        The algorithm tries to build a balanced testset between edges and
        non-edges, with some uncertainty for undirected graphs.

        Notes
        -----
        For undirected graphs, the sparse matrix is symmetrized, so the value
        of an edge can be cumulated from the two inverted samples. Thus the
        ratio for the non-edges is reduced to 1/3 of the edge ratio to be fair.

        This operation can be sub-optimal since it compares arrays of size N**2.

        Warning: if set_filter=False, weights are not set in the testset matrix.
    '''

    testset_ratio = self.get_testset_ratio()
    if not testset_ratio:
        raise ValueError('Testset ratio not understood : %s' % testset_ratio)

    g = self.data
    y = self.adj()
    N = g.num_vertices()
    E = g.num_edges()

    is_directed = not self.is_symmetric()
    symmetric_scale = 1 if is_directed else 2

    size = N**2

    #validset = lil_matrix((N,N))
    testset = lil_matrix((N,N), dtype=bool)

    n = int(E * testset_ratio) # number of edges
    nz = int(n *float(self.expe.get('zeros_ratio', 1))) # number of non-links

    edges = g.get_edges().astype(int)
    i, j = edges[:, 0:2].T

    # Sampling edges
    ix = np.random.choice(E, n, replace=False)
    ix = np.array((i[ix], j[ix]))
    #testset[ix[0], ix[1]] = 1

    # Sampling non-edges
    #
    # Ignore the diagonal => include the diag indices
    # in (i,j) as if they were selected in the testset.
    # If undirected, also force ignoring the lower part of the adjacency matrix.
    if is_directed:
        i = np.hstack((i, np.arange(N)))
        j = np.hstack((j, np.arange(N)))
    else:
        _i = i
        i = np.hstack((i, j, np.arange(N)))
        j = np.hstack((j, _i, np.arange(N)))

    edge_set = set(np.ravel_multi_index((i,j), y.shape))
    nonlinks_set = set()
    max_loop = 150
    cpt = 0
    # Add the probability to sample symmetric edge...
    zeros_len = int(nz + int(not is_directed)*nz/size)
    while len(nonlinks_set) < zeros_len or cpt >= max_loop:
        nnz = zeros_len - len(nonlinks_set)
        nonlinks_set.update(set(np.random.choice(size, nnz)))
        nonlinks_set -= edge_set
        cpt+=1

    self.log.debug('Number of iteration for sampling non-edges: %d' % cpt)
    if cpt >= max_loop:
        self.log.warning('Sampling tesset nonlinks failed:')
        print('desires size: %d, obtained: %s' % (zeros_len, len(nonlinks_set)))

    jx = np.array(np.unravel_index(list(nonlinks_set), (N,N)))

    #k = np.ravel_multi_index((i,j), y.shape)
    #jx = np.random.choice(size, int(nz/symmetric_scale), replace=False)
    #kind = np.bincount(k, minlength=size).astype(bool)
    #jind = np.bincount(jx, minlength=size).astype(bool)
    ## xor on bool faster ?!
    #_n = (jind & (kind==0)).sum()
    #max_loop = 100
    #cpt = 0
    #while _n < int(nz/symmetric_scale) or cpt > max_loop:
    #    jx = np.random.choice(size, int(nz/symmetric_scale) - _n, replace=False)
    #    jind = jind|np.bincount(jx, minlength=size).astype(bool)
    #    _n = (jind & (kind==0)).sum()
    #    cpt += 1
    #    #print(_n, n)
    #self.log.debug('Number of iteration for sampling non-edges: %d'%cpt)
    #jx = np.arange(size)[jind & (kind==0)]
    #jx = np.array(np.unravel_index(jx, (N,N)))

    # Actually setting the testset entries
    testset[ix[0], ix[1]] = 1
    testset[jx[0], jx[1]] = 1
    if not is_directed:
        testset[ix[1], ix[0]] = 1
        testset[jx[1], jx[0]] = 1

    data_test_w = np.transpose(testset.nonzero())
    #self.reverse_filter()
    weights = []
    for i,j in data_test_w:
        weights.append(self.weight(i,j))
    #self.reverse_filter()
    data_test_w = np.hstack((data_test_w, np.array(weights)[None].T))

    if set_filter:
        # Also set the weight here...
        testset_filter = g.new_ep('bool')
        testset_filter.a = 1
        self.log.info('Looping to set the testset edge eproperties (graph-tool)...')
        for i, j in ix.T:
            testset_filter[i,j] = 0
        g.set_edge_filter(testset_filter)

    self._links_testset = ix
    self._nonlinks_testset = jx

    self.data_test = testset
    self.data_test_w = data_test_w

    # if we learn with a training_ratio < 1
    # we just ignore some random points.
    training_ratio = self.get_training_ratio()
    if training_ratio > 0 and training_ratio < 1:
        # remove edges
        self.data.set_fast_edge_removal()
        edges = self.data.get_edges()
        weights = self.data.ep['weights']
        n_extra = len(edges) - training_ratio*len(edges)
        edges_to_remove = np.random.choice(len(edges), int(n_extra), replace=False)
        for pos in edges_to_remove:
            i,j = edges[pos, :2]
            e = g.edge(i,j)
            g.remove_edge(e)
        self.data.set_fast_edge_removal(fast=False)

    return
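A hedged usage sketch (not part of the module): fr is assumed to be an already-built frontend whose expe carries testset_ratio in percent, and the helper name split_and_report is hypothetical. After make_testset(), the held-out entries are available both as a sparse boolean mask and as an (i, j, weight) array, and the graph carries an edge filter hiding the held-out links.

def split_and_report(fr):
    fr.make_testset(set_filter=True)
    held_out = fr.data_test          # scipy.sparse lil_matrix of booleans
    triples = fr.data_test_w         # rows of (i, j, weight)
    print('held-out entries:', held_out.nnz, 'triples:', len(triples))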
def modularity(self)
-
Source code
def modularity(self):
    if not 'labels' in self.data.vp:
        return None
    labels = self.data.vp['labels']
    if 'weights' in self.data.ep:
        weights = self.data.ep['weights']
    else:
        weights = None
    mod = gt.inference.modularity(self.data, labels, weights)
    return mod
def net_type(self)
-
Source code
def net_type(self):
    weights = self.data.ep['weights'].a
    return 'm/std/mn/mx: %.1f %.1f %d %d' % (weights.mean(), weights.std(), weights.min(), weights.max())
def num_edges(self)
-
Source code
def num_edges(self):
    return self.data.num_edges()
def num_mnb(self)
-
Minibatch size for the sampling.
Source code
def num_mnb(self):
    ''' Minibatch size for the sampling. '''
    return int(self.num_nodes() * self._zeros_set_len * float(self.expe['sampling_coverage']))
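A worked example of the count above (illustrative only): with 1000 nodes, zeros_set_len=10 and sampling_coverage=0.5, the sampler draws int(1000 * 10 * 0.5) = 5000 minibatches per pass.

print(int(1000 * 10 * 0.5))   # 5000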
def num_neighbors(self)
-
Source code
def num_neighbors(self):
    neigs = []
    for v in range(self.num_nodes()):
        _v = self.data.vertex(v)
        neigs.append(len(list(_v.all_neighbors())))
    return np.asarray(neigs)
def num_nnz(self)
-
Source code
def num_nnz(self):
    N = self.data.num_vertices()
    sym_pt = 1 if self.is_directed() else 2
    if hasattr(self, 'data_test'):
        T = N*(N-1)/sym_pt - int(self.data_test.sum()/sym_pt)
    else:
        T = N*(N-1)/sym_pt
    return T
def num_nnzsum(self)
-
Source code
def num_nnzsum(self):
    return self.data.ep['weights'].a.sum()
def num_nodes(self)
-
Source code
def num_nodes(self):
    return self.data.num_vertices()
def remove_self_loops(self)
-
Source code
def remove_self_loops(self):
    # Or set a property with gt.stats.remove_self_loops(g)
    return gt.stats.remove_self_loops(self.data)
def reverse_filter(self)
-
Source code
def reverse_filter(self):
    filter, is_inv = self.data.get_edge_filter()
    self.data.set_edge_filter(filter, inverted=not is_inv)
def sample(self, N)
-
Randomly reduce the number of nodes of the graph.
Source code
def sample(self, N):
    ''' Randomly reduce the number of nodes of the graph. '''
    n_to_remove = self.data.num_vertices() - N
    ids = np.random.randint(0, self.data.num_vertices(), n_to_remove)
    self.data.remove_vertex(ids)
    self.data.shrink_to_fit()
def set_directed(self, directed=True)
-
Source code
def set_directed(self, directed=True):
    # Modifying the adjacency matrix does not modify the gt graph
    #y = self.adj()
    #(y!=y.T).nnz == 0
    self.data.set_directed(directed)
def weight(self, i, j)
-
Source code
def weight(self, i, j):
    w = self.data.ep['weights']
    if self.edge(i, j):
        return w[i, j]
    else:
        return 0
Inherited members