Module pymake.frontend.frontendnetwork
Source code
import sys, os
import itertools
from collections import defaultdict
from string import Template
from numpy import ma
import numpy as np
import scipy as sp
import scipy.sparse    # make sp.sparse available (used by from_array)
import scipy.special   # make sp.special available (used by get_nnz)
import networkx as nx
try:
    import community as pylouvain
except ImportError:
    pass
from .frontend import DataBase
from .drivers import DatasetDriver
from .frontendnetwork_gt import frontendNetwork_gt
from pymake.util.math import *
def getClique(N=100, K=4):
from scipy.linalg import block_diag
b = []
for k in range(K):
n = N // K
b.append(np.ones((n,n), int))
C = block_diag(*b)
return C
### @Issue42: frontendNetwork should be imported from frontend
### =====> : resolve this with @class_method (from_harddrive etc...)
class frontendNetwork(DataBase, DatasetDriver):
""" Frontend for network data.
Symmetric network support.
"""
RANDOM_CORPUS = ('clique', 'alternate', 'BA')
_selfloop = False
def __init__(self, expe=None):
super(frontendNetwork, self).__init__(expe)
self._data_type = 'network'
data_format = expe.get('_data_format', 'b')
if data_format == 'w':
self._net_type = data_format
self._dtype = int
elif data_format == 'b':
self._net_type = data_format
self._dtype = bool
else:
            raise NotImplementedError('Network format unknown: %s' % data_format)
# @Obsolete
# How to handle undefined variable ?
# What category for object ??
self.homo = int(expe.get('homo', 0))
self.clusters = None
self.features = None
self.true_classes = None
self.data_t = None
@classmethod
def from_array(cls, array):
fr = cls()
if isinstance(array, sp.sparse.csr_matrix):
raise NotImplementedError
elif isinstance(array, np.ma.MaskedArray):
data = array.data
fr.data_ma = array
elif isinstance(array, np.ndarray):
data = array
else:
raise NotImplementedError
fr.update_data(data)
return fr
#@mmm
def _set_rawdata_for_likelihood_computation(self):
''' Format train and test data to compute some score. '''
# JUNK
# for loglikelihood bernoulli computation
# For measure on the training set
self.data_A = self.data_ma.copy()
self.data_A.data[self.data_A.data == 0] = -1
self.data_B = np.ones(self.data_ma.shape) - self.data_ma
        # For measures on the test set
data_ma_t = ma.array(self.data_ma.data, mask=~self.data_ma.mask)
self.data_A_t = data_ma_t.copy()
self.data_A_t.data[self.data_A_t.data == 0] = -1
self.data_B_t = np.ones(data_ma_t.shape) - data_ma_t
def load_data(self, randomize=False):
""" Load data according to different scheme,
by order of priority (if several specification in settings)
* Corpus from random generator
* Corpus from file dataset
"""
corpus_name = self.corpus_name
if self.corpus_name.startswith(self.RANDOM_CORPUS):
data = self.random_corpus(corpus_name)
else:
data = self.fs_corpus(corpus_name)
if data is None:
self.log.warning('Unable to load corpus: %s' % (corpus_name))
return
self.update_data(data)
        # For GoF smoothness
        # error in degree_ check ?
if self.has_selfloop():
np.fill_diagonal(self.data, 1)
if randomize:
self.shuffle_node()
return self.data
def fs_corpus(self, corpus_name):
""" @debug Be smarter, has some database strategy.
Redirect to correct path depending on the corpus_name
"""
# DB integration ?
if corpus_name.startswith(('generator', 'graph')):
format = 'graph'
        elif corpus_name in ('bench1',):
raise NotImplementedError()
elif corpus_name.startswith('facebook'):
format = 'edges'
elif corpus_name in ('manufacturing',):
format = 'csv'
elif corpus_name in ('fb_uc', 'emaileu'):
format = 'txt'
elif corpus_name in ('blogs','propro', 'euroroad'):
format = 'dat'
else:
            raise ValueError('Which corpus to load: %s ?' % corpus_name)
data = self.networkloader(corpus_name, format)
for a in ('features', 'clusters'):
if not hasattr(self, a):
setattr(self, a, None)
return data
def shuffle_instances(self):
index = np.arange(np.shape(self.data)[0])
np.random.shuffle(index)
self.data = self.data[index, :]
#if hasattr(self.data, 'A'):
# data = self.data.A
# np.random.shuffle(data)
# self.data = sp.sparse.csr_matrix(data)
#else:
# np.random.shuffle(self.data)
def shuffle_node(self):
""" Shuffle rows and columns of data """
N, M = self.data.shape
nodes_list = [np.random.permutation(N), np.random.permutation(M)]
self.reorder_node(nodes_list)
    @staticmethod
    def symmetrize(data=None):
        ''' In-place symmetrization (copy the upper triangle onto the lower). '''
        if data is None:
            return None
        data[:] = np.triu(data) + np.triu(data, 1).T
    def shuffle_features(self):
        raise NotImplementedError
def reorder_node(self, nodes_l):
""" Subsample the data with reordoring of rows and columns """
# Track the original nodes
self.nodes_list = [self.nodes_list[0][nodes_l[0]], self.nodes_list[1][nodes_l[1]]]
self.data = self.data[nodes_l[0], :][:, nodes_l[1]]
if hasattr(self, 'features') and self.features is not None:
self.features = self.features[nodes_l[0]]
if hasattr(self, 'clusters') and self.clusters is not None:
self.clusters = self.clusters[nodes_l[0]]
def sample(self, N, symmetric=False, randomize=False):
""" Write self ! """
if N == 'all':
N = self.data.shape[0]
else:
N = int(N)
        # Can't see why modifications inside self.nodes_list are not propagated?
if randomize is True:
nodes_list = [np.random.permutation(N), np.random.permutation(N)]
self.reorder_node(nodes_list)
if N < self.data.shape[0]:
self.data = self.data[:N, :N]
self.update_data(self.data)
return self.data
def update_data(self, data):
''' Node list order will be lost '''
if data.dtype != self._dtype:
            data = data.astype(self._dtype) * 1  # Bool operations are painful
self.data = data
N, M = self.data.shape
self.N = N
self.nodes_list = [np.arange(N), np.arange(M)]
if hasattr(self, 'features') and self.features is not None:
self.features = self.features[:N]
if hasattr(self, 'clusters') and self.clusters is not None:
self.clusters = self.clusters[:N]
def make_testset(self, diag_off=1):
''' Make the test set with masked array. '''
testset_ratio = float(self.expe.get('testset_ratio'))
if testset_ratio >= 1:
testset_ratio = testset_ratio / 100
elif 0 <= testset_ratio < 1:
pass
else:
            raise ValueError('cross-validation ratio not understood: %s' % testset_ratio)
mask_type = self.expe.get('mask', 'unbalanced')
if mask_type == 'unbalanced':
self.data_ma = self.get_masked(testset_ratio, diag_off)
elif mask_type == 'balanced':
self.data_ma = self.get_masked_balanced(testset_ratio, diag_off)
elif mask_type == 'zeros':
self.data_ma = self.get_masked_zeros(diag_off)
else:
            raise ValueError('mask type unknown: %s' % mask_type)
return
def get_masked(self, testset_ratio, diag_off=1):
""" Construct a random mask.
        Random training set on 20% of the data / debug5 - debug11 -- Unbalanced
"""
data = self.data
if type(data) is np.ndarray:
#self.data_mat = sp.sparse.csr_matrix(data)
pass
else:
            raise NotImplementedError('unknown corpus type: %s' % type(data))
n = int(data.size * testset_ratio)
mask_index = np.unravel_index(np.random.permutation(data.size)[:n], data.shape)
mask = np.zeros(data.shape, dtype=data.dtype)
mask[mask_index] = 1
if self.is_symmetric():
mask = np.tril(mask) + np.tril(mask, -1).T
data_ma = ma.array(data, mask=mask)
if diag_off == 1:
np.fill_diagonal(data_ma, ma.masked)
return data_ma
def get_masked_balanced(self, testset_ratio, diag_off=1):
''' Construct Mask based on the proportion of 1/links.
        Random training set on 20% of the data vertices (0.2 * data == 1) / debug6 - debug10 -- Balanced
'''
data = self.data
if type(data) is np.ndarray:
#self.data_mat = sp.sparse.csr_matrix(data)
pass
else:
            raise NotImplementedError('unknown corpus type: %s' % type(data))
        # Corresponding indices
_0 = np.array(list(zip(*np.where(data == 0))))
_1 = np.array(list(zip(*np.where(data == 1))))
n = int(len(_1) * testset_ratio)
# Choice of Index
n_0 = _0[np.random.choice(len(_0), n, replace=False)]
n_1 = _1[np.random.choice(len(_1), n, replace=False)]
# Corresponding Mask
        mask_index = tuple(zip(*(np.concatenate((n_0, n_1)))))  # (rows, cols)
mask = np.zeros(data.shape, dtype=data.dtype)
mask[mask_index] = 1
if self.is_symmetric():
mask = np.tril(mask) + np.tril(mask, -1).T
data_ma = ma.array(data, mask=mask)
if diag_off == 1:
np.fill_diagonal(data_ma, ma.masked)
return data_ma
def get_masked_zeros(self, diag_off=1):
''' Take out all zeros '''
data = self.data
if type(data) is np.ndarray:
#self.data_mat = sp.sparse.csr_matrix(data)
pass
else:
            raise NotImplementedError('unknown corpus type: %s' % type(data))
mask = np.zeros(data.shape, dtype=data.dtype)
mask[data == 0] = 1
if self.is_symmetric():
mask = np.tril(mask) + np.tril(mask, -1).T
data_ma = ma.array(data, mask=mask)
if diag_off == 1:
np.fill_diagonal(data_ma, ma.masked)
return data_ma
def is_symmetric(self, update=False):
if update or not hasattr(self, 'symmetric'):
self.symmetric = (self.data == self.data.T).all()
return self.symmetric
def is_directed(self):
return not self.is_symmetric()
def random_corpus(self, rnd):
N = self.getN()
if isinstance(N, str):
self.log.warning('Random graph size missing (-n): Using 100 nodes.')
N = 100
if rnd == 'uniform':
data = np.random.randint(0, 2, (N, N))
#np.fill_diagonal(data, 1)
elif rnd.startswith('clique'):
try :
K = int(rnd[len('clique'):])
except ValueError:
K = 42
data = getClique(N, K=K)
#Data = nx.adjacency_matrix(G, np.random.permutation(range(N))).A
elif rnd in ('BA', 'barabasi-albert'):
data = nx.adjacency_matrix(nx.barabasi_albert_graph(N, m=int(0.92*N)) ).A
elif rnd == 'alternate':
#data = np.empty((N,N),int)
data = np.zeros((N,N), int)
type_rd = 2
if type_rd == 1:
# degree alternating with frequency fr
fr = 3
data[:, ::fr] = 1
elif type_rd == 2:
# degree equal
data[:, ::2] = 1
data[::2] = np.roll(data[::2], 1)
return data
else:
raise NotImplementedError()
return data
def networkloader(self, corpus_name, format):
""" Load pickle or parse data.
        The format argument tells the parser how to read the file.
Notes
-----
        Corpora are stored under a dedicated path: {pmk/data/training/corpus_name}
"""
data = None
bdir = self.expe._input_path
fn = self._resolve_filename(self.expe)
# pmk file format...
if self._force_load_data and os.path.isfile(fn+'.gz'):
try:
data = self._load_data(fn)
except Exception as e:
self.log.error('Error : %s on %s' % (e, fn))
data = None
if data is None:
ext = format
_fn = os.path.join(bdir, corpus_name +'.'+ ext)
            if os.path.isfile(_fn) and os.stat(_fn).st_size == 0:
                self.log.warning('Doh, corpus file is empty at: %s' % _fn)
self.data = None
return
if ext == 'graph': # Dancer
_fn = os.path.join(bdir, 't0.graph')
_data = self.parse_dancer(_fn)
elif ext == 'edges': # NotImplemented
_fn = os.path.join(bdir, '0.edges')
_data = self.parse_edges(_fn)
raise NotImplementedError
            elif ext in ('txt',):
_data = self.parse_tnet(_fn)
elif ext == 'csv':
_data = self.parse_csv(_fn)
elif ext == 'dat':
_data = self.parse_dat(_fn)
else:
raise ValueError('extension of network data unknown')
data = _data['data']
self.features = _data.get('features')
self.clusters = _data.get('clusters')
if self._force_save_data:
self._save_data(fn, data)
        if np.tril(data, k=-1).sum() == 0:
            # Symmetrize if the lower triangle is empty.
            self.symmetrize(data)
return data
def _old_communities_analysis(self):
clusters = self.clusters
if clusters is None:
return None
data = self.data
symmetric = self.is_symmetric()
community_distribution = list(np.bincount(clusters))
local_attach = {}
for n, c in enumerate(clusters):
comm = str(c)
local = local_attach.get(comm, [])
degree_n = data[n,:][clusters == c].sum()
if not symmetric:
degree_n += data[:, n][clusters == c].sum()
local.append(degree_n)
local_attach[comm] = local
return community_distribution, local_attach, clusters
# used by (obsolete) zipf.py
def communities_analysis(self, *args, **kwargs):
        from pymake.util.algo import adj_to_degree  # local import: circular import if done at the top
clusters = self.clusters
if clusters is None:
return None
data = self.data
symmetric = self.is_symmetric()
community_distribution = list(np.bincount(clusters))
block_hist = np.bincount(clusters)
local_degree = {}
if symmetric:
k_perm = np.unique(list( map(list, map(set, itertools.product(np.unique(clusters) , repeat=2)))))
else:
k_perm = itertools.product(np.unique(clusters) , repeat=2)
for c in k_perm:
            if type(c) in (np.float64, np.int64):
                # a single cluster (as happens with hard/max assignment)
                l = k = c
            elif len(c) == 2:
                # stochastic equivalence (inter-class links)
                k, l = c
            else:
                # communities (intra-class links)
                k = l = c.pop()
comm = (str(k), str(l))
local = local_degree.get(comm, [])
C = np.tile(clusters, (data.shape[0],1))
y_c = data * ((C==k) & (C.T==l))
if y_c.size > 0:
local_degree[comm] = adj_to_degree(y_c).values()
# Summing False !
#for n in np.arange(data.shape[0]))[clusters == k]:
# degree_n = data[n,:][(clusters == k) == (clusters == l)].sum()
# if not symmetric:
# degree_n = data[n,:][(clusters == k) == (clusters == l)].sum()
# local.append(degree_n)
#local_degree[comm] = local
return {'local_degree':local_degree,
'clusters': np.asarray(clusters),
'block_hist': block_hist,
'size': len(block_hist)}
def getG(self):
if not hasattr(self, 'G'):
if self.is_symmetric():
# Undirected Graph
typeG = nx.Graph()
else:
# Directed Graph
typeG = nx.DiGraph()
self.G = nx.from_numpy_matrix(self.data, create_using=typeG)
#self.G = nx.from_scipy_sparse_matrix(self.data, typeG)
return self.G
    def to_directed(self):
        ''' Return a version of the graph where all links are flattened (directions dropped). '''
        if self.is_symmetric():
            return self.getG()
        else:
            # nx.to_undirected needs links in both directions.
            return nx.from_numpy_matrix(self.data, create_using=nx.Graph())
#
# Get Statistics
#
def num_nodes(self):
g = self.getG()
return g.number_of_nodes()
def num_edges(self):
g = self.getG()
return g.number_of_edges()
def diameter(self):
g = self.getG()
        try:
            diameter = nx.diameter(g)
        except nx.NetworkXError:
            # diameter is undefined for disconnected graphs
            diameter = None
return diameter
def density(self):
g = self.getG()
return nx.density(g)
def modularity(self):
part = self.get_partition()
if not part:
return None
g = self.getG()
try:
modul = pylouvain.modularity(part, g)
except NameError:
            self.log.error("python-louvain library is not installed; "
                           "modularity can't be computed")
modul = None
return modul
def clustering_coefficient(self):
g = self.getG()
        try:
            cc = nx.average_clustering(g)
        except Exception:
            # e.g. empty graph
            cc = None
return cc
def net_type(self):
return '%s / max value: %s' % (self._net_type, np.max(self.data))
def feat_len(self):
return len(np.unique(self.data))
@property
def _type(self):
g = self.getG()
return type(g)
@property
def _shape(self):
g = self.getG()
return g.shape
def getN(self):
if hasattr(self, 'N'):
return self.N
N = str(self.expe['N'])
if N.isdigit():
N = int(N)
elif N.lower() in ('all', 'false', 'none'):
N = 'all'
else:
            raise TypeError('Size of data not set (-n)')
self.N = N
return self.N
#def louvain_feature(self):
# get the louvain modularity
# and the feature for local analysis
def degree(self):
g = self.getG()
degree = list(dict(nx.degree(g)).values())
return degree
def degree_histogram(self):
g = self.getG()
return nx.degree_histogram(g)
def get_nfeat(self):
nfeat = self.data.max() + 1
if nfeat == 1:
            self.log.warning('Warning, only zeros in adjacency matrix...')
nfeat = 2
return nfeat
def get_nnz(self):
        ''' Number of possible dyads (tokens). '''
size = sp.special.binom(self.getN(), 2)
if not self.is_symmetric():
size *= 2
if self.has_selfloop():
            size += self.getN()
return size
def ma_nnz(self):
return len(self.data_ma.compressed())
def ma_nnz_t(self):
return self.data_ma.mask.sum()
    # Counts, for each node, the number of observed interactions.
    # @debug no longer true for bipartite networks
def ma_dims(self):
''' Number of non masked values by row/nodes. '''
#data_dims = np.vectorize(len)(self.data)
#data_dims = [r.count() for r in self.data_ma]
data_dims = []
for i in range(self.data_ma.shape[0]):
data_dims.append(self.data_ma[i,:].count() + self.data_ma[:,i].count())
return np.array(data_dims, dtype=int)
def has_selfloop(self):
return self._selfloop
def get_params(self):
clusters = self.get_clusters()
K = max(clusters)+1
N = len(clusters)
theta = np.zeros((N,K))
theta[np.arange(N),clusters] = 1
return theta, None
def get_clusters(self):
return self.clusters
def get_partition(self, clusters=None):
if getattr(self, 'clusters', None) is None:
return {}
else:
clusters = self.clusters
N = len(clusters)
return dict(zip(*[np.arange(N), clusters]))
def clusters_len(self):
clusters = self.get_clusters()
if not clusters:
return None
else:
return max(clusters)+1
    # Template for corpus information: instances, nnz, features, etc.
def template(self, dct, templ):
return Template(templ).substitute(dct)
def get_data_prop(self):
prop = defaultdict()
prop.update( {'corpus': self.corpus_name,
'instances' : self.data.shape[1] })
if self.is_symmetric():
nnz = np.triu(self.data).sum()
else:
nnz = self.data.sum()
_nnz = self.data.sum(axis=1)
d = {'instances': self.data.shape[1],
'nnz': nnz,
'nnz_mean': _nnz.mean(),
'nnz_var': _nnz.var(),
'density': self.density(),
'diameter': self.diameter(),
'clustering_coef': self.clustering_coefficient(),
'modularity': self.modularity(),
'communities': self.clusters_len(),
'features': self.get_nfeat(),
'directed': not self.is_symmetric()
}
prop.update(d)
return prop
def likelihood(self, theta, phi):
likelihood = theta.dot(phi).dot(theta.T)
return likelihood
    # NOTE: shadows template(dct, templ) defined above; only this version is visible on instances.
    def template(self, d):
d['time'] = d.get('time', None)
netw_templ = '''###### $corpus
Building: $time minutes
Nodes: $instances
Links: $nnz
Degree mean: $nnz_mean
Degree var: $nnz_var
Diameter: $diameter
Modularity: $modularity
Clustering Coefficient: $clustering_coef
Density: $density
Communities: $communities
Relations: $features
Directed: $directed
\n'''
return super(frontendNetwork, self).template(d, netw_templ)
def similarity_matrix(self, sim='cos'):
features = self.features
if features is None:
return None
if sim == 'dot':
sim = np.dot(features, features.T)
elif sim == 'cos':
norm = np.linalg.norm(features, axis=1)[np.newaxis]
sim = np.dot(features, features.T)/np.dot(norm.T, norm)
elif sim == 'kmeans':
cluster = kmeans(features, K=2)[np.newaxis]
cluster[cluster == 0] = -1
sim = np.dot(cluster.T,cluster)
elif sim == 'comm':
N = len(self.clusters)
#sim = np.repeat(np.array(self.clusters)[np.newaxis], N, 0)
theta , _ = self.get_params()
sim = theta.dot(theta.T)
sim = (sim == sim.T)*1
sim[sim < 1] = -1
elif sim == 'euclide_old':
from sklearn.metrics.pairwise import euclidean_distances as ed
#from plot import kmeans_plus
#kmeans_plus(features, K=4)
            print(features)
dist = ed(features)
K = self.parameters_['k']
devs = self.parameters_['devs'][0]
sim = np.zeros(dist.shape)
sim[dist <= 2.0 * devs / K] = 1
sim[dist > 2.0 * devs / K] = -1
elif sim == 'euclide_abs':
from sklearn.metrics.pairwise import euclidean_distances as ed
#from plot import kmeans_plus
#kmeans_plus(features, K=4)
N = len(features)
K = self.parameters_['k']
devs = self.parameters_['devs'][0]
a = np.repeat(features[:,0][None], N, 0).T
b = np.repeat(features[:,0][None], N, 0)
sim1 = np.abs( a-b )
a = np.repeat(features[:,1][None], N, 0).T
b = np.repeat(features[:,1][None], N, 0)
sim2 = np.abs( a-b )
sim3 = np.zeros((N,N))
sim3[sim1 <= 2.0* devs / K] = 1
sim3[sim1 > 2.0 * devs / K] = -1
sim4 = np.zeros((N,N))
sim4[sim2 <= 2.0* devs / K] = 1
sim4[sim2 > 2.0 * devs / K] = -1
sim = sim4 + sim3
sim[sim >= 0] = 1
sim[sim < 0] = -1
elif sim == 'euclide_dist':
from sklearn.metrics.pairwise import euclidean_distances as ed
#from plot import kmeans_plus
#kmeans_plus(features, K=4)
N = len(features)
K = self.parameters_['k']
devs = self.parameters_['devs'][0]
sim1 = ed(np.repeat(features[:,0][None], 2, 0).T)
sim2 = ed(np.repeat(features[:,0][None], 2, 0).T)
sim3 = np.zeros((N,N))
sim3[sim1 <= 2.0* devs / K] = 1
sim3[sim1 > 2.0 * devs / K] = -1
sim4 = np.zeros((N,N))
sim4[sim2 <= 2.0* devs / K] = 1
sim4[sim2 > 2.0 * devs / K] = -1
sim = sim4 + sim3
sim[sim >= 0] = 1
sim[sim < 0] = -1
return sim
def homophily(self, model=None, sim='cos', type='kleinberg'):
N = self.data.shape[0]
card = N*(N-1)
if model:
data = model.generate(N)
#y = np.triu(y) + np.triu(y, 1).T
gram_matrix = model.similarity_matrix(sim=sim)
            delta_threshold = .1
            gram_matrix[gram_matrix >= delta_threshold] = 1
            gram_matrix[gram_matrix < delta_threshold] = -1
else:
data = self.data
gram_matrix = self.similarity_matrix(sim=sim)
if gram_matrix is None:
return np.nan, np.nan
connected = data.sum()
unconnected = card - connected
similar = (gram_matrix > 0).sum()
unsimilar = (gram_matrix <= 0).sum()
indic_source = ma.array(np.ones(gram_matrix.shape)*-1, mask=ma.masked)
indic_source[(data == 1) & (gram_matrix > 0)] = 0
indic_source[(data == 1) & (gram_matrix <= 0)] = 1
indic_source[(data == 0) & (gram_matrix > 0)] = 2
indic_source[(data == 0) & (gram_matrix <= 0)] = 3
np.fill_diagonal(indic_source, ma.masked)
indic_source[indic_source == -1] = ma.masked
a = (indic_source==0).sum()
b = (indic_source==1).sum()
c = (indic_source==2).sum()
d = (indic_source==3).sum()
if type == 'kleinberg':
            #print('a: %s, connected: %s, similar %s, card: %s' % (a, connected, similar, card))
            homo_obs = 1.0 * a / connected   # precision: homophily respected
            homo_exp = 1.0 * similar / card  # recall: strength of homophily
else:
raise NotImplementedError
#if sim == 'euclide' and type is None:
# homo_obs = 1.0 * (a + d - c - b) / card
# pr = 1.0 * (data == 1).sum() / card
# ps = 1.0 * (indic_source==0).sum() / card
# pnr = 1.0 - pr
# pns = 1.0 - ps
# a_ = pr*ps*card
# b_ = pnr*ps*card
# c_ = pr*pns*card
# d_ = pnr*pns*card
# homo_expect = (a_+b_-c_-d_) /card
# return homo_obs, homo_expect
return homo_obs, homo_exp
def assort(self, model):
#if not source:
# data = self.data
# sim_source = self.similarity_matrix('cos')
data = self.data
N = self.data.shape[0]
sim_source = self.similarity_matrix(sim='cos')
y = model.generate(N)
#y = np.triu(y) + np.triu(y, 1).T
sim_learn = model.similarity_matrix(sim='cos')
assert(N == y.shape[0])
indic_source = ma.array(np.ones(sim_source.shape)*-1, mask=ma.masked)
indic_source[(data == 1) & (sim_source > 0)] = 0
indic_source[(data == 1) & (sim_source <= 0)] = 1
indic_source[(data == 0) & (sim_source > 0)] = 2
indic_source[(data == 0) & (sim_source <= 0)] = 3
indic_learn = ma.array(np.ones(sim_learn.shape)*-1, mask=ma.masked)
indic_learn[(y == 1) & (sim_learn > 0)] = 0
indic_learn[(y == 1) & (sim_learn <= 0)] = 1
indic_learn[(y == 0) & (sim_learn > 0)] = 2
indic_learn[(y == 0) & (sim_learn <= 0)] = 3
np.fill_diagonal(indic_learn, ma.masked)
np.fill_diagonal(indic_source, ma.masked)
indic_source[indic_source == -1] = ma.masked
indic_learn[indic_learn == -1] = ma.masked
        ### Homophily indicator (Christine)
homo_ind1_source = 1.0 * ( (indic_source==0).sum()+(indic_source==3).sum()-(indic_source==1).sum() - (indic_source==2).sum() ) / (N*(N-1))
homo_ind1_learn = 1.0 * ( (indic_learn== 0).sum()+(indic_learn==3).sum()-(indic_learn==1).sum() - (indic_learn==2).sum() ) / (N*(N-1))
# AMI / NMI
from sklearn import metrics
AMI = metrics.adjusted_mutual_info_score(indic_source.compressed(), indic_learn.compressed())
NMI = metrics.normalized_mutual_info_score(indic_source.compressed(), indic_learn.compressed())
print('homo_ind1 source: %f' % (homo_ind1_source))
print('homo_ind1 learn: %f' % (homo_ind1_learn))
print('AMI: %f, NMI: %f' % (AMI, NMI))
d = {'NMI' : NMI, 'homo_ind1_source' : homo_ind1_source, 'homo_ind1_learn' : homo_ind1_learn}
return d
Functions
def getClique(N=100, K=4)
-
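A minimal usage sketch (assuming the module is importable as pymake.frontend.frontendnetwork): getClique returns a block-diagonal adjacency matrix of K disjoint cliques of N // K nodes each.

import numpy as np
from pymake.frontend.frontendnetwork import getClique

C = getClique(N=6, K=3)        # three disjoint 2-node cliques
assert C.shape == (6, 6)
assert C.sum() == 3 * 2 * 2    # each 2x2 block is all ones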
Classes
class frontendNetwork (expe=None)
-
Frontend for network data. Symmetric network support.
Ancestors
DataBase
DatasetDriver
Class variables
var RANDOM_CORPUS
Static methods
def from_array(array)
-
def symmetrize(self, data=None)
-
In-place symmetrization.
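The symmetrization recipe in isolation: copy the upper triangle (diagonal included) onto the lower one, writing back in place through slice assignment.

import numpy as np

data = np.array([[0, 1, 1],
                 [0, 0, 1],
                 [0, 0, 0]])
data[:] = np.triu(data) + np.triu(data, 1).T
# data is now symmetric:
# [[0 1 1]
#  [1 0 1]
#  [1 1 0]]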
Methods
def assort(self, model)
-
def clustering_coefficient(self)
-
def clusters_len(self)
-
def communities_analysis(self, *args, **kwargs)
-
def degree(self)
-
def degree_histogram(self)
-
def density(self)
-
def diameter(self)
-
def feat_len(self)
-
def fs_corpus(self, corpus_name)
-
@debug Be smarter, use some database strategy. Redirect to the correct path depending on corpus_name.
def getG(self)
-
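A compatibility note: nx.from_numpy_matrix, used above, was removed in NetworkX 3.x; on recent versions the equivalent construction is nx.from_numpy_array.

import numpy as np
import networkx as nx

A = np.random.randint(0, 2, (10, 10))
G = nx.from_numpy_array(A, create_using=nx.DiGraph())  # modern replacement
print(G.number_of_nodes(), G.number_of_edges())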
def getN(self)
-
def get_clusters(self)
-
def get_data_prop(self)
-
def get_masked(self, testset_ratio, diag_off=1)
-
Construct a random mask. Random training set on 20% of the data / debug5 - debug11 -- Unbalanced
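The unbalanced scheme in isolation, as a standalone sketch on toy data: mask a random testset_ratio fraction of all cells, then mirror the mask across the diagonal for symmetric networks.

import numpy as np
from numpy import ma

data = np.random.randint(0, 2, (10, 10))
testset_ratio = 0.2
n = int(data.size * testset_ratio)
idx = np.unravel_index(np.random.permutation(data.size)[:n], data.shape)
mask = np.zeros(data.shape, dtype=int)
mask[idx] = 1
mask = np.tril(mask) + np.tril(mask, -1).T   # symmetric case
data_ma = ma.array(data, mask=mask)
print(data.size - data_ma.count(), 'cells held out for testing')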
def get_masked_balanced(self, testset_ratio, diag_off=1)
-
Construct a mask based on the proportion of 1s/links. Random training set on 20% of the data vertices (0.2 * data == 1) / debug6 - debug10 -- Balanced
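The balanced scheme in isolation: hide the same number of links (1s) and non-links (0s), sized by testset_ratio times the link count (np.argwhere stands in for the zip(*np.where(...)) idiom used above).

import numpy as np

data = np.random.randint(0, 2, (10, 10))
_0 = np.argwhere(data == 0)
_1 = np.argwhere(data == 1)
n = int(len(_1) * 0.2)
n_0 = _0[np.random.choice(len(_0), n, replace=False)]
n_1 = _1[np.random.choice(len(_1), n, replace=False)]
rows, cols = np.concatenate((n_0, n_1)).T
mask = np.zeros(data.shape, dtype=int)
mask[rows, cols] = 1    # equal numbers of links and non-links hidden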
def get_masked_zeros(self, diag_off=1)
-
Take out all zeros
def get_nfeat(self)
-
def get_nnz(self)
-
Number of possible dyads (tokens).
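The dyad count behind get_nnz, worked out: C(N, 2) unordered pairs for a symmetric network, doubled when directed, plus N when self-loops are allowed.

from scipy.special import binom

N = 100
undirected = binom(N, 2)     # 4950.0 unordered pairs
directed = 2 * undirected    # 9900.0 ordered pairs
with_loops = directed + N    # 10000.0 == N * N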
def get_params(self)
-
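The one-hot membership matrix built by get_params, on a toy assignment:

import numpy as np

clusters = np.array([0, 2, 1, 2])
N, K = len(clusters), clusters.max() + 1
theta = np.zeros((N, K))
theta[np.arange(N), clusters] = 1   # row i is one-hot on node i's cluster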
def get_partition(self, clusters=None)
-
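The node-to-cluster dict built here is the partition format python-louvain expects:

import numpy as np

clusters = np.array([0, 0, 1, 1, 2])
partition = dict(zip(np.arange(len(clusters)), clusters))
# {0: 0, 1: 0, 2: 1, 3: 1, 4: 2}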
def has_selfloop(self)
-
def homophily(self, model=None, sim='cos', type='kleinberg')
-
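A toy sketch of the 'kleinberg' measure (the diagonal is excluded up front here; the method achieves the same by masking): observed homophily is the fraction of links that fall between similar pairs, expected homophily the overall fraction of similar pairs.

import numpy as np

data = np.array([[0, 1, 1],
                 [1, 0, 0],
                 [1, 0, 0]])
gram = np.array([[ 1,  1, -1],
                 [ 1,  1, -1],
                 [-1, -1,  1]])
off = ~np.eye(3, dtype=bool)                # ignore the diagonal
connected = data[off].sum()                 # 4 links
a = ((data == 1) & (gram > 0))[off].sum()   # 2 links between similar pairs
card = 3 * 2                                # N * (N - 1)
homo_obs = a / connected                    # 0.5, precision-like
homo_exp = (gram > 0)[off].sum() / card     # ~0.33, baseline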
def is_directed(self)
-
Source code
def is_directed(self):
    return not self.is_symmetric()
def is_symmetric(self, update=False)
-
Source code
def is_symmetric(self, update=False):
    if update or not hasattr(self, 'symmetric'):
        self.symmetric = (self.data == self.data.T).all()
    return self.symmetric
def likelihood(self, theta, phi)
-
Source code
def likelihood(self, theta, phi):
    likelihood = theta.dot(phi).dot(theta.T)
    return likelihood
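The product has a simple reading: with theta the N x K community memberships and phi the K x K block-affinity matrix, entry (i, j) is the link score between nodes i and j. A small sketch with hypothetical values:

import numpy as np

theta = np.array([[1., 0.],   # nodes 0 and 1 in community 0
                  [1., 0.],
                  [0., 1.]])  # node 2 in community 1
phi = np.array([[0.9, 0.1],
                [0.1, 0.8]])  # dense within, sparse across communities

lik = theta.dot(phi).dot(theta.T)
# lik[0, 1] == 0.9 (same community), lik[0, 2] == 0.1 (across)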
def load_data(self, randomize=False)
-
Load data according to different schemes, by order of priority (if several are specified in settings): * Corpus from random generator * Corpus from file dataset
Source code
def load_data(self, randomize=False):
    """ Load data according to different schemes,
        by order of priority (if several are specified in settings)
        * Corpus from random generator
        * Corpus from file dataset
    """
    corpus_name = self.corpus_name

    if self.corpus_name.startswith(self.RANDOM_CORPUS):
        data = self.random_corpus(corpus_name)
    else:
        data = self.fs_corpus(corpus_name)

    if data is None:
        self.log.warning('Unable to load corpus: %s' % (corpus_name))
        return

    self.update_data(data)

    # For GoF smoothness
    # error in degree_ check ?
    if self.has_selfloop():
        np.fill_diagonal(self.data, 1)

    if randomize:
        self.shuffle_node()

    return self.data
def ma_dims(self)
-
Number of non-masked values per row/node.
Source code
def ma_dims(self):
    ''' Number of non-masked values per row/node. '''
    #data_dims = np.vectorize(len)(self.data)
    #data_dims = [r.count() for r in self.data_ma]
    data_dims = []
    for i in range(self.data_ma.shape[0]):
        data_dims.append(self.data_ma[i, :].count() + self.data_ma[:, i].count())
    return np.array(data_dims, dtype=int)
def ma_nnz(self)
-
Source code
def ma_nnz(self):
    return len(self.data_ma.compressed())
def ma_nnz_t(self)
-
Source code
def ma_nnz_t(self):
    return self.data_ma.mask.sum()
def make_testset(self, diag_off=1)
-
Make the test set with a masked array.
Source code
def make_testset(self, diag_off=1):
    ''' Make the test set with a masked array. '''
    testset_ratio = float(self.expe.get('testset_ratio'))
    if testset_ratio >= 1:
        testset_ratio = testset_ratio / 100
    elif 0 <= testset_ratio < 1:
        pass
    else:
        raise ValueError('cross-validation ratio not understood: %s' % testset_ratio)

    mask_type = self.expe.get('mask', 'unbalanced')
    if mask_type == 'unbalanced':
        self.data_ma = self.get_masked(testset_ratio, diag_off)
    elif mask_type == 'balanced':
        self.data_ma = self.get_masked_balanced(testset_ratio, diag_off)
    elif mask_type == 'zeros':
        self.data_ma = self.get_masked_zeros(diag_off)
    else:
        raise ValueError('mask type unknown: %s' % mask_type)

    return
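Note the ratio normalization at the top: values of one or more are read as percentages, so 20 and 0.2 request the same split. A standalone restatement of that rule:

def normalize_ratio(r):
    r = float(r)
    if r >= 1:
        r = r / 100          # 20 -> 0.2
    elif 0 <= r < 1:
        pass                 # already a fraction
    else:
        raise ValueError('cross-validation ratio not understood: %s' % r)
    return r

assert normalize_ratio(20) == normalize_ratio(0.2) == 0.2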
def modularity(self)
-
Source code
def modularity(self):
    part = self.get_partition()
    if not part:
        return None
    g = self.getG()
    try:
        modul = pylouvain.modularity(part, g)
    except NameError:
        self.log.error("python-louvain library is not installed; modularity can't be computed")
        modul = None
    return modul
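For reference, a minimal standalone use of the python-louvain API the method relies on (community.best_partition and community.modularity), shown here on a stock networkx graph:

import networkx as nx
import community as pylouvain   # pip install python-louvain

g = nx.karate_club_graph()
part = pylouvain.best_partition(g)   # {node: community id}
q = pylouvain.modularity(part, g)    # Newman modularity of that partition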
def net_type(self)
-
Source code
def net_type(self):
    return '%s / max value: %s' % (self._net_type, np.max(self.data))
def networkloader(self, corpus_name, format)
-
Load pickled data, or parse the raw file. The format argument drives the parsing.
Notes
Corpora live in a special path: {pmk/data/training/corpus_name}
Source code
def networkloader(self, corpus_name, format):
    """ Load pickled data, or parse the raw file.
        The format argument drives the parsing.

        Notes
        -----
        Corpora live in a special path: {pmk/data/training/corpus_name}
    """
    data = None
    bdir = self.expe._input_path
    fn = self._resolve_filename(self.expe)

    # pmk file format...
    if self._force_load_data and os.path.isfile(fn + '.gz'):
        try:
            data = self._load_data(fn)
        except Exception as e:
            self.log.error('Error : %s on %s' % (e, fn))
            data = None

    if data is None:
        ext = format
        _fn = os.path.join(bdir, corpus_name + '.' + ext)
        if os.path.isfile(fn) and os.stat(fn).st_size == 0:
            self.log.warning('Doh, corpus file is empty at: %s' % fn)
            self.data = None
            return

        if ext == 'graph':
            # Dancer
            _fn = os.path.join(bdir, 't0.graph')
            _data = self.parse_dancer(_fn)
        elif ext == 'edges':
            # NotImplemented
            _fn = os.path.join(bdir, '0.edges')
            _data = self.parse_edges(_fn)
            raise NotImplementedError
        elif ext in ('txt',):
            _data = self.parse_tnet(_fn)
        elif ext == 'csv':
            _data = self.parse_csv(_fn)
        elif ext == 'dat':
            _data = self.parse_dat(_fn)
        else:
            raise ValueError('extension of network data unknown')

        data = _data['data']
        self.features = _data.get('features')
        self.clusters = _data.get('clusters')

        if self._force_save_data:
            self._save_data(fn, data)

    if np.tril(data, k=-1).sum() == 0:
        # Symmetrize if the lower triangle is empty.
        self.Symmetrize(data)

    return data
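The final symmetrization test deserves a note: parsers that emit only upper-triangular edges leave the strictly lower triangle empty, which is what np.tril(data, k=-1) detects. A sketch of the idea, where data + data.T stands in for the class's Symmetrize helper (whose body is not shown here):

import numpy as np

data = np.array([[0, 1, 1],
                 [0, 0, 1],
                 [0, 0, 0]])            # upper-triangular edge list

if np.tril(data, k=-1).sum() == 0:      # lower triangle is empty
    data = data + data.T                # treat as undirected and symmetrize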
def num_edges(self)
-
Source code
def num_edges(self):
    g = self.getG()
    return g.number_of_edges()
def num_nodes(self)
-
Source code
def num_nodes(self):
    g = self.getG()
    return g.number_of_nodes()
def random_corpus(self, rnd)
-
Source code
def random_corpus(self, rnd):
    N = self.getN()
    if isinstance(N, str):
        self.log.warning('Random graph size missing (-n): Using 100 nodes.')
        N = 100

    if rnd == 'uniform':
        data = np.random.randint(0, 2, (N, N))
        #np.fill_diagonal(data, 1)
    elif rnd.startswith('clique'):
        try:
            K = int(rnd[len('clique'):])
        except ValueError:
            K = 42
        data = getClique(N, K=K)
        #Data = nx.adjacency_matrix(G, np.random.permutation(range(N))).A
    elif rnd in ('BA', 'barabasi-albert'):
        data = nx.adjacency_matrix(nx.barabasi_albert_graph(N, m=int(0.92*N))).A
    elif rnd == 'alternate':
        #data = np.empty((N,N), int)
        data = np.zeros((N, N), int)
        type_rd = 2
        if type_rd == 1:
            # degree alternating with frequency fr
            fr = 3
            data[:, ::fr] = 1
        elif type_rd == 2:
            # degree equal
            data[:, ::2] = 1
            data[::2] = np.roll(data[::2], 1)
        return data
    else:
        raise NotImplementedError()

    return data
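The clique branch encodes the number of blocks in the corpus name itself: the digits after 'clique' are parsed as K, and an absent or unparsable suffix falls back to K=42. For example:

for rnd in ('clique4', 'clique10', 'clique'):
    try:
        K = int(rnd[len('clique'):])
    except ValueError:
        K = 42
    print(rnd, '->', K)   # clique4 -> 4, clique10 -> 10, clique -> 42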
def reorder_node(self, nodes_l)
-
Subsample the data, reordering rows and columns.
Source code
def reorder_node(self, nodes_l):
    """ Subsample the data, reordering rows and columns. """
    # Track the original nodes
    self.nodes_list = [self.nodes_list[0][nodes_l[0]], self.nodes_list[1][nodes_l[1]]]
    self.data = self.data[nodes_l[0], :][:, nodes_l[1]]
    if hasattr(self, 'features') and self.features is not None:
        self.features = self.features[nodes_l[0]]
    if hasattr(self, 'clusters') and self.clusters is not None:
        self.clusters = self.clusters[nodes_l[0]]
def sample(self, N, symmetric=False, randomize=False)
-
Write self !
Source code
def sample(self, N, symmetric=False, randomize=False):
    """ Write self ! """
    if N == 'all':
        N = self.data.shape[0]
    else:
        N = int(N)

    # Can't see why a modification inside self.nodes_list is not propagated?
    if randomize is True:
        nodes_list = [np.random.permutation(N), np.random.permutation(N)]
        self.reorder_node(nodes_list)

    if N < self.data.shape[0]:
        self.data = self.data[:N, :N]
        self.update_data(self.data)
    return self.data
def shuffle_features(self)
-
Source code
def shuffle_features(self):
    raise NotImplementedError
def shuffle_instances(self)
-
Source code
def shuffle_instances(self):
    index = np.arange(np.shape(self.data)[0])
    np.random.shuffle(index)
    self.data = self.data[index, :]
def shuffle_node(self)
-
Shuffle rows and columns of data
Source code
def shuffle_node(self):
    """ Shuffle rows and columns of data """
    N, M = self.data.shape
    nodes_list = [np.random.permutation(N), np.random.permutation(M)]
    self.reorder_node(nodes_list)
def similarity_matrix(self, sim='cos')
-
Source code
def similarity_matrix(self, sim='cos'):
    features = self.features
    if features is None:
        return None

    if sim == 'dot':
        sim = np.dot(features, features.T)
    elif sim == 'cos':
        norm = np.linalg.norm(features, axis=1)[np.newaxis]
        sim = np.dot(features, features.T) / np.dot(norm.T, norm)
    elif sim == 'kmeans':
        cluster = kmeans(features, K=2)[np.newaxis]
        cluster[cluster == 0] = -1
        sim = np.dot(cluster.T, cluster)
    elif sim == 'comm':
        N = len(self.clusters)
        #sim = np.repeat(np.array(self.clusters)[np.newaxis], N, 0)
        theta, _ = self.get_params()
        sim = theta.dot(theta.T)
        sim = (sim == sim.T) * 1
        sim[sim < 1] = -1
    elif sim == 'euclide_old':
        from sklearn.metrics.pairwise import euclidean_distances as ed
        #from plot import kmeans_plus
        #kmeans_plus(features, K=4)
        print(features)
        dist = ed(features)
        K = self.parameters_['k']
        devs = self.parameters_['devs'][0]
        sim = np.zeros(dist.shape)
        sim[dist <= 2.0 * devs / K] = 1
        sim[dist > 2.0 * devs / K] = -1
    elif sim == 'euclide_abs':
        from sklearn.metrics.pairwise import euclidean_distances as ed
        #from plot import kmeans_plus
        #kmeans_plus(features, K=4)
        N = len(features)
        K = self.parameters_['k']
        devs = self.parameters_['devs'][0]
        a = np.repeat(features[:, 0][None], N, 0).T
        b = np.repeat(features[:, 0][None], N, 0)
        sim1 = np.abs(a - b)
        a = np.repeat(features[:, 1][None], N, 0).T
        b = np.repeat(features[:, 1][None], N, 0)
        sim2 = np.abs(a - b)
        sim3 = np.zeros((N, N))
        sim3[sim1 <= 2.0 * devs / K] = 1
        sim3[sim1 > 2.0 * devs / K] = -1
        sim4 = np.zeros((N, N))
        sim4[sim2 <= 2.0 * devs / K] = 1
        sim4[sim2 > 2.0 * devs / K] = -1
        sim = sim4 + sim3
        sim[sim >= 0] = 1
        sim[sim < 0] = -1
    elif sim == 'euclide_dist':
        from sklearn.metrics.pairwise import euclidean_distances as ed
        #from plot import kmeans_plus
        #kmeans_plus(features, K=4)
        N = len(features)
        K = self.parameters_['k']
        devs = self.parameters_['devs'][0]
        sim1 = ed(np.repeat(features[:, 0][None], 2, 0).T)
        # Second feature column; the original read features[:, 0] here,
        # which looks like a copy-paste slip given the euclide_abs branch.
        sim2 = ed(np.repeat(features[:, 1][None], 2, 0).T)
        sim3 = np.zeros((N, N))
        sim3[sim1 <= 2.0 * devs / K] = 1
        sim3[sim1 > 2.0 * devs / K] = -1
        sim4 = np.zeros((N, N))
        sim4[sim2 <= 2.0 * devs / K] = 1
        sim4[sim2 > 2.0 * devs / K] = -1
        sim = sim4 + sim3
        sim[sim >= 0] = 1
        sim[sim < 0] = -1

    return sim
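Among those branches, the cosine case is the workhorse; a self-contained check of the normalization it performs:

import numpy as np

features = np.array([[1., 0.],
                     [1., 1.],
                     [0., 1.]])

norm = np.linalg.norm(features, axis=1)[np.newaxis]
cos = np.dot(features, features.T) / np.dot(norm.T, norm)
# cos[0, 2] == 0.0 (orthogonal features), cos[0, 1] ~= 0.707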
def template(self, d)
-
Source code
def template(self, d):
    d['time'] = d.get('time', None)
    netw_templ = '''###### $corpus
    Building: $time minutes
    Nodes: $instances
    Links: $nnz
    Degree mean: $nnz_mean
    Degree var: $nnz_var
    Diameter: $diameter
    Modularity: $modularity
    Clustering Coefficient: $clustering_coef
    Density: $density
    Communities: $communities
    Relations: $features
    Directed: $directed
    \n'''
    return super(frontendNetwork, self).template(d, netw_templ)
def to_directed(self)
-
Return a version of the graph where all links are flattened.
Source code
def to_directed(self):
    ''' Return a version of the graph where all links are flattened. '''
    if self.is_symmetric():
        return self.getG()
    else:
        # nx to_undirected needs links in both directions.
        return nx.from_numpy_matrix(self.data, create_using=nx.Graph())
def update_data(self, data)
-
Node list order will be lost
Source code
def update_data(self, data):
    ''' Node list order will be lost '''
    if data.dtype != self._dtype:
        data = data.astype(self._dtype) * 1  # Bool operations are painful

    self.data = data
    N, M = self.data.shape
    self.N = N
    self.nodes_list = [np.arange(N), np.arange(M)]

    if hasattr(self, 'features') and self.features is not None:
        self.features = self.features[:N]
    if hasattr(self, 'clusters') and self.clusters is not None:
        self.clusters = self.clusters[:N]
Inherited members