%matplotlib inline
import time
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
def initializeCommunities(g, k, S):
node_num = 0
degree = 0
for v in g.nodes():
neighbors = g.neighbors(v)
n_len = len(neighbors)
node_num += 1
degree += n_len
if n_len >= k:
S[v] = set(neighbors)
S[v].add(v)
#added[v] = set(neighbors)
#print 'init finish'
print "average degree is", float(degree) / node_num
bucks = max(20, float(degree) / node_num)
def duplicationRemovel(S, ovl):
delete_list = []
nodes = sorted(S.keys())
count = 0
for u in nodes:
if u not in S:
continue
for v in sorted(S[u]):
if u == v:
continue
if v in S:
join = S[u].intersection(S[v])
join_len = len(join)
if u == 11878:
print u, v, join, join_len
if join_len+0.001 >= ovl * len(S[u]) and join_len+0.001 >= ovl * len(S[v]):
count += 1
s = v if len(S[u]) > len(S[v]) else u
m = u if s == v else v
#print 'when %d , Communtiy %d removed for duplication. Duplicate with %d' % (u, s, m)
S.pop(s)
if s == u:
break
return S
def xi(v, S, g):
neighbors = set(g.neighbors(v))
return 1.0 * len(neighbors.intersection(S)) / len(neighbors)
def psi(C, Cdot):
return 1.0 * len(C.intersection(Cdot)) / min(len(C), len(Cdot))
def findStayOff(temp):
anchor = 0
mark = 0
for e in range(len(temp)-1, 0, -1):
if not mark and temp[e]:
mark = e
break
#print temp, mark,
for e in range(mark-1, 0, -1):
if temp[e] < temp[mark]:
mark = e
if temp[e-1] >= temp[e] and temp[e] == temp[mark]:
# print mark, e
anchor = e+1
break
#print anchor
return anchor
temp = [6, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0]
print findStayOff(temp)
def zeta(v, S, g, k):
neighbors = set(g.neighbors(v)).intersection(S)
return 1.0*(len(neighbors)-k+1)/(len(S) - k) if len(neighbors) > k else 0
def computeScoreList(S, g, k):
zeta_scores = {}
count = 0
join_scores = {}
for i in S:
for vj in S[i]:
zeta_scores[vj] = {}
join_scores[vj] = {}
for i in S:
Si = S[i]
Si_len = len(Si)
count += Si_len
for vj in Si:
if vj == i:
continue
neighbors = g.neighbors(vj)
n_len = len(neighbors)
sin = Si.intersection(neighbors)
sin_len = len(sin)
if sin_len < k:
zeta_scores[vj][i] = 0
join_scores[vj][i] = 0
else:
zetascore = float(sin_len - k) / (Si_len - k)
joinscore = sin_len / n_len
zeta_scores[vj][i] = zetascore
zeta_scores[vj]
#k = zetascore
join_scores[vj][i] = joinscore
#print 'zeta compute count', count
# for vj in zeta_scores:
# print vj, zeta_scores[vj], g.neighbors(vj)
return zeta_scores, join_scores
def leaveCommunities(S, g, k, ovl, zeta_scores, zetacut):
delete_list = []
leave = True
for s in sorted(S.keys()):
count = len(S[s])
for n in sorted(S[s]):
if n == s:
continue
if n not in zetacut or zeta_scores[n][s] < (zetacut[n]-0.001)/bucks or zeta_scores[n][s] <= 0.001:
S[s].remove(n)
count -= 1
if count <= k:
S.pop(s)
leave = False
break
return leave
def selectBucket(scores_list, count):
comm_count = 0
bucket_list = [0] * count
for e in scores_list:
if e < 0.001:
continue
comm_count += 1
bucket = int((e-0.001) * count)
#print bucket, 'count ' , count, 'e ', e
bucket_list[bucket] += 1
if comm_count > 1:
anchor = findStayOff(bucket_list)
else:
anchor = 0
#fp.write(" %s comm_count %d bucket is %d\n" % (str(bucket_list), comm_count, anchor))
#print " %s comm_count %d bucket is %d\n" % (str(bucket_list), comm_count, anchor)
#print bucket_list,
return anchor
def expandCommunities(S, g, join_scores):
for v in join_scores:
count = bucks
anchor = selectBucket(join_scores[v].values(), count)
join_scores[v]['cutoff'] = (anchor-1) * 1.0 / count
#for e in join_scores:
# print e, join_scores[e]
#for e in added:
# print e, added[e]
status = True
#nowadded = {}
for i in S:
#nowadded[i] = set()
for vj in list(S[i]):
for uk in g.neighbors(vj):
if uk not in join_scores or i not in join_scores[uk]:
#print 'not in' , uk, i, uk not in join_scores, i not in join_scores[uk]
continue
if join_scores[uk][i] > join_scores[uk]['cutoff'] and uk not in S[i]:
S[i].add(uk)
status = False
return status
def summary_cluster(S):
count = 0
for i in S:
count += len(S[i])
return len(S), count
def preferredCommunities(g, k, ovl):
S = {}
initializeCommunities(g, k, S)
print "init stage, cluster %d ge, total node %d" % summary_cluster(S), time.time() - prev
expand = 10
while expand:
if expand <= 0:
break
expand -= 1
t = 10
while t:
t -= 1
S = duplicationRemovel(S, ovl)
print 'After duplicate removal, cluster %d ge, total node %d' % summary_cluster(S), time.time() - prev
zeta_scores, join_scores = computeScoreList(S, g, k)
zeta_cutoff = {}
for v in zeta_scores:
anchor = selectBucket(zeta_scores[v].values(), bucks)
zeta_cutoff[v] = anchor
leave = leaveCommunities(S, g, k, ovl, zeta_scores, zeta_cutoff)
print 'After leave function, cluster %d ge, total node %d' % summary_cluster(S), time.time() - prev
if leave:
break
status = expandCommunities(S, g, join_scores)
print 'After expand function, cluster %d ge, total node %d' % summary_cluster(S), time.time() - prev
if status:
break
print 'finish, cluster %d ge, total node %d' % summary_cluster(S)
return S
#data = np.loadtxt('G:/project/dataset/dblp/com-dblp.ungraph.txt', dtype=np.int32, delimiter='\t')
data = np.loadtxt('G:/software/open_source/focs-master/tmp.txt', dtype=np.int32, delimiter='\t', usecols=(0, 1))
G = nx.Graph()
G.add_edges_from(data)
g = nx.adjacency_matrix(G)
print G.number_of_nodes()
S = {}
bucks = 20
prev = time.time()
start = prev
z = preferredCommunities(G, 2, 0.6)
print 'total time', time.time() - start
def test_2():
S = {}
bucks = 20
k = 2
OVL = 0.6
initializeCommunities(G, k, S)
#print summary_cluster(S)
#duplicationRemovel(S, OVL)
#print summary_cluster(S)
zeta_scores, join_scores = computeScoreList(S, G, k)
zeta_cutoff = {}
fp = file('G:/software/open_source/focs-master/cutoff_pyt.txt', 'w')
for v in sorted(G.nodes()):
count = bucks
fp.write("phase 1 iter 0 cutoff %d " % v)
if v in zeta_scores:
anchor = selectBucket(zeta_scores[v].values(), bucks)
else:
anchor = selectBucket([], bucks)
zeta_cutoff[v] = anchor
#print v, zeta_scores[v]['cutoff']
fp.close()
#print len(S.keys())
#for i in sorted(S.keys()):
leaveCommunities(S, G, k, OVL, zeta_scores, zeta_cutoff)
print summary_cluster(S)
#fp = file('G:/software/open_source/focs-master/cutoff_pyt.txt', 'w')
#for v in sorted(zeta_scores):
# count = bucks
# fp.write("phase 1 cutoff %d " % v)
# anchor = selectBucket(zeta_scores[v].values(), bucks)
#print v, anchor
#print v, zeta_scores[v]['cutoff']
#fp.close()
colors = ['r'] * len(test.nodes())
cmap = {0:'b', 10:'yellow', 27:'g'}
for e in z:
for n in z[e]:
colors[n] = cmap[e]
print colors
nx.draw_networkx(test, node_size=150, node_color=colors)
pcdqdata = np.loadtxt('G:/software/open_source/focs-master/tmp.txt', dtype=np.int32, delimiter='\t', usecols=(0, 1))
pcdqG = nx.Graph()
pcdqG.add_edges_from(pcdqdata)
g = nx.adjacency_matrix(pcdqG)
pcdqS = {}
added = {}
prev = time.time()
z = preferredCommunities(pcdqG, 2, 0.6)
print time.time() - prev
pcdqS = {}
added = {}
prev = time.time()
initializeCommunities(pcdqG, 2, pcdqS)
duplicationRemovel(pcdqS, 0.6)
#z = preferredCommunities(pcdqG, 2, 0.6)
print time.time() - prev
#log = file('log1.txt', 'w')
#
#for u in sorted(pcdqS.keys())[:10]:
# for v in sorted(pcdqS[u]):
# if u == v:
# continue
# print u, v, zeta_scores[v]['cutoff'], zeta_scores[v][u]
# #log.write("node %d nei node %d, cut-off %.3f, real score %.3f\n" % (u, v, zeta_scores[v]['cutoff'], zeta_scores[v][u]))
#
#log.close()
ground_truth = u"G:/project/dataset/社区发现/dblp/com-dblp.all.cmty.txt"
my_result = "G:/software/open_source/focs-master/focs-master/ordered.txt"
gt_fp = file(ground_truth)
mr_fp = file(my_result)
gt_content = gt_fp.readlines()
mr_content = mr_fp.readlines()
gt_fp.close()
mr_fp.close()
gt_cmt_set = set()
mr_cmt_set = set()
for l in gt_content:
c = tuple(int(e) for e in l.split())
gt_cmt_set.add(c)
for l in mr_content:
c = tuple(int(e) for e in l.split())
mr_cmt_set.add(c)
print len(gt_cmt_set), len(mr_cmt_set)
x = set([1,2,3,5,2,3,4,5,2,3,4])
from math import log
N = 317080
c_gt = 0
for e in gt_cmt_set:
c_i = len(e)
c_gt += c_i * log(float(c_i) / N)
c_mr = 0
for e in mr_cmt_set:
c_i = len(e)
c_mr += c_i * log(float(c_i) / N)
print c_gt, c_mr
comm_nmi = 0
#for i in gt_cmt_set:
# for j in mr_cmt_set:
# c_i = len(i)
# c_j = len(j)
# c_ij = len(set(i).intersection(set(j)))
# if not c_ij:
# continue
# comm_nmi += c_ij * log(float(c_ij * N) / (c_i * c_j))
print comm_nmi
from collections import defaultdict
gt_invert_index = defaultdict(list)
mr_invert_index = defaultdict(list)
#for s in gt_cmt_set:
# for e in s:
# gt_invert_index[e].append(s)
#
#for s in mr_cmt_set:
# for e in s:
# mr_invert_index[e].append(s)
print len(gt_invert_index), len(mr_invert_index)
comm_nmi = 0
#for e in gt_invert_index:
# if e in mr_invert_index:
# for i in gt_invert_index[e]:
# for j in mr_invert_index[e]:
# c_i = len(i)
# c_j = len(j)
# c_ij = len(set(i).intersection(set(j)))
# comm_nmi += c_ij * log(float(c_ij * N) / (c_i * c_j))
print comm_nmi
nmi = -2 * comm_nmi / (c_gt + c_mr)
print nmi
def random_init_graph():
import random
from itertools import combinations
edges = []
for (u, v) in combinations(range(10), 2):
r = random.random()
if r < 0.5:
edges.append((u, v))
for (u, v) in combinations(range(8), 2):
r = random.random()
if r < 0.5:
edges.append((u+10, v+10))
for (u, v) in combinations(range(9), 2):
r = random.random()
if r < 0.6:
edges.append((u+18, v+18))
for (u, v) in combinations(range(5), 2):
r = random.random()
if r < 0.4:
edges.append((u+27, v+27))
for (u, v) in combinations(range(6), 2):
r = random.random()
if r < 0.1:
edges.append((u, v+10))
for (u, v) in combinations(range(7), 2):
r = random.random()
if r < 0.3:
edges.append((u, v+18))
for (u, v) in combinations(range(6), 2):
r = random.random()
if r < 0.2:
edges.append((u, v+27))
print len(edges)
test = nx.Graph()
test.add_edges_from(edges)
#fp = file('dataset'+str(len(test))+'.txt', 'w')
#for (u, v) in edges:
# fp.write('%d %d\n' % (u, v))
#fp.close()
nx.draw(test, with_label = True)
pos = nx.spring_layout(test)
nx.draw_networkx(test, pos=pos)
colors = ['r'] * len(test.nodes())
cmap = {0:'b', 10:'yellow', 27:'g'}
#for e in test:
# for n in test[e]:
# colors[n] = cmap[e]
#print colors
#nx.draw_networkx(test, node_size=150, node_color=colors)
random_init_graph()