In [ ]:
%matplotlib inline 

import time
import matplotlib.pyplot as plt
import numpy as np


import networkx as nx
In [ ]:
def initializeCommunities(g, k, S):
    node_num = 0
    degree = 0
    for v in g.nodes():
        neighbors = g.neighbors(v)
        n_len = len(neighbors)
        node_num += 1
        degree += n_len
        if n_len >= k:
            S[v] = set(neighbors)
            S[v].add(v)
            #added[v] = set(neighbors)       
    #print 'init finish'
    print "average degree is", float(degree) / node_num
    bucks = max(20, float(degree) / node_num)
In [ ]:
def duplicationRemovel(S, ovl):
    delete_list = []
    nodes = sorted(S.keys())
    count = 0
    for u in nodes:
        if u not in S:
            continue

        for v in sorted(S[u]):
            if u == v:
                continue

            if v in S:
                join = S[u].intersection(S[v])
                join_len = len(join)
                if u == 11878:
                    print u, v, join, join_len
                if join_len+0.001 >= ovl * len(S[u]) and join_len+0.001 >= ovl * len(S[v]):
                    count += 1
                    s = v if len(S[u]) > len(S[v]) else u
                    m = u if s == v else v
                    #print 'when %d , Communtiy %d removed for duplication. Duplicate with %d' % (u, s, m)
                    
                    S.pop(s)
                    if s == u:
                        break

    return S
In [ ]:
def xi(v, S, g):
    neighbors = set(g.neighbors(v))    
    return 1.0 * len(neighbors.intersection(S)) / len(neighbors)
    
def psi(C, Cdot):
    return 1.0 * len(C.intersection(Cdot)) / min(len(C), len(Cdot))

def findStayOff(temp):
    anchor = 0
    mark = 0

    for e in range(len(temp)-1, 0, -1):
        if not mark and temp[e]:
            mark = e
            break
            
    #print temp, mark,
    for e in range(mark-1, 0, -1):
        if temp[e] < temp[mark]:
            mark = e

        if temp[e-1] >= temp[e] and temp[e] == temp[mark]:
            
     #       print mark, e
            anchor = e+1
            break
    #print anchor
    return anchor
In [ ]:
temp = [6, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0]
print findStayOff(temp)
In [ ]:
def zeta(v, S, g, k):
    neighbors = set(g.neighbors(v)).intersection(S)
    return 1.0*(len(neighbors)-k+1)/(len(S) - k) if len(neighbors) > k else 0
    
def computeScoreList(S, g, k):
    zeta_scores = {}
    count = 0
    join_scores = {}
    
    for i in S:
        for vj in S[i]:
            zeta_scores[vj] = {}
            join_scores[vj] = {}
            
    for i in S:
        Si = S[i]
        Si_len = len(Si)
        count += Si_len    
        
        for vj in Si:
            if vj == i:
                continue

            neighbors = g.neighbors(vj)
            n_len = len(neighbors)
            
            sin = Si.intersection(neighbors)
            sin_len = len(sin)
            
            if sin_len < k:
                zeta_scores[vj][i] = 0
                join_scores[vj][i] = 0
            else:
                zetascore = float(sin_len - k) / (Si_len - k)
                joinscore = sin_len / n_len
                zeta_scores[vj][i] = zetascore
                zeta_scores[vj]
                #k = zetascore 
                join_scores[vj][i] = joinscore

    #print 'zeta compute count', count
    # for vj in zeta_scores:
    #    print vj, zeta_scores[vj], g.neighbors(vj)
    return zeta_scores, join_scores
In [ ]:
def leaveCommunities(S, g, k, ovl, zeta_scores, zetacut):    
    delete_list = []
    leave = True
    
    for s in sorted(S.keys()):
        count = len(S[s])
        for n in sorted(S[s]):
            if n == s:
                continue

            if n not in zetacut or zeta_scores[n][s] < (zetacut[n]-0.001)/bucks or zeta_scores[n][s] <= 0.001:
                S[s].remove(n)
                count -= 1
                
            if count <= k:
                S.pop(s)
                leave = False
                break
    
    return leave
In [ ]:
def selectBucket(scores_list, count):

    comm_count = 0
    bucket_list = [0] * count
    for e in scores_list:
        if e < 0.001:
            continue
        comm_count += 1
        bucket = int((e-0.001) * count)
        #print bucket, 'count ' , count, 'e ', e
        bucket_list[bucket] += 1
    if comm_count > 1:
        anchor = findStayOff(bucket_list)
    else:
        anchor = 0
    #fp.write(" %s comm_count %d bucket is %d\n" % (str(bucket_list), comm_count, anchor))
    #print " %s comm_count %d bucket is %d\n" % (str(bucket_list), comm_count, anchor)
    #print bucket_list, 
    return anchor
In [ ]:
def expandCommunities(S, g, join_scores):

    for v in join_scores:
        count = bucks
        anchor = selectBucket(join_scores[v].values(), count)
        join_scores[v]['cutoff'] = (anchor-1) * 1.0 / count
    
    #for e in join_scores:
     #   print e, join_scores[e]
    #for e in added:
    #    print e, added[e]
    
    status = True
    #nowadded = {}
    for i in S:
        #nowadded[i] = set()
        for vj in list(S[i]):
            for uk in g.neighbors(vj):
                if uk not in join_scores or i not in join_scores[uk]:
                    #print 'not in' , uk, i, uk not in join_scores, i not in join_scores[uk]
                    continue
                if join_scores[uk][i] > join_scores[uk]['cutoff'] and uk not in S[i]:
                    S[i].add(uk)
                    status = False
    return status
In [ ]:
def summary_cluster(S):
    count = 0
    for i in S:
        count += len(S[i])
    return len(S), count
In [ ]:
def preferredCommunities(g, k, ovl):
    S = {}
    
    initializeCommunities(g, k, S)
    print "init stage, cluster %d ge, total node %d" % summary_cluster(S), time.time() - prev
    expand = 10
    while expand:
        
        if expand <= 0:
            break
        expand -= 1

        t = 10
        while t:
            t -= 1
            S = duplicationRemovel(S, ovl)

            print 'After duplicate removal, cluster %d ge, total node %d' % summary_cluster(S), time.time() - prev

            zeta_scores, join_scores = computeScoreList(S, g, k)
                                    
            zeta_cutoff = {}

            for v in zeta_scores:
                anchor = selectBucket(zeta_scores[v].values(), bucks)
                zeta_cutoff[v] = anchor

            leave = leaveCommunities(S, g, k, ovl, zeta_scores, zeta_cutoff)
            print 'After leave function, cluster %d ge, total node %d' % summary_cluster(S), time.time() - prev

            if leave:
                break

        status = expandCommunities(S, g, join_scores)
        print 'After expand function, cluster %d ge, total node %d' % summary_cluster(S), time.time() - prev
        if status:
            break

    print 'finish, cluster %d ge, total node %d' % summary_cluster(S)
    return S
In [ ]:
#data = np.loadtxt('G:/project/dataset/dblp/com-dblp.ungraph.txt', dtype=np.int32, delimiter='\t')
data = np.loadtxt('G:/software/open_source/focs-master/tmp.txt', dtype=np.int32, delimiter='\t', usecols=(0, 1))
G = nx.Graph()
G.add_edges_from(data)
g = nx.adjacency_matrix(G)
In [ ]:
print G.number_of_nodes()
In [ ]:
S = {}
bucks = 20
prev = time.time()
start = prev
z = preferredCommunities(G, 2, 0.6)
print 'total time', time.time() - start
In [ ]:
def test_2():
    S = {}
    bucks = 20
    k = 2
    OVL = 0.6
    initializeCommunities(G, k, S)
    #print summary_cluster(S)
    #duplicationRemovel(S, OVL)
    #print summary_cluster(S)

    zeta_scores, join_scores = computeScoreList(S, G, k)

    zeta_cutoff = {}

    fp = file('G:/software/open_source/focs-master/cutoff_pyt.txt', 'w')
    for v in sorted(G.nodes()):
        count = bucks
        fp.write("phase 1 iter 0 cutoff %d " % v)
        if v in zeta_scores:
            anchor = selectBucket(zeta_scores[v].values(), bucks)
        else:
            anchor = selectBucket([], bucks)
        zeta_cutoff[v] = anchor
        #print v, zeta_scores[v]['cutoff']
    fp.close()

    #print len(S.keys())
    #for i in sorted(S.keys()):
    leaveCommunities(S, G, k, OVL, zeta_scores, zeta_cutoff)
    print summary_cluster(S)
    #fp = file('G:/software/open_source/focs-master/cutoff_pyt.txt', 'w')
    #for v in sorted(zeta_scores):
     #   count = bucks
      #  fp.write("phase 1 cutoff %d " % v)
       # anchor = selectBucket(zeta_scores[v].values(), bucks)
        #print v, anchor
        #print v, zeta_scores[v]['cutoff']
    #fp.close()
In [ ]:
 
In [ ]:
 
In [ ]:
colors = ['r'] * len(test.nodes())
cmap = {0:'b', 10:'yellow', 27:'g'}
for e in z:
    
    for n in z[e]:
        colors[n] = cmap[e]
print colors
In [ ]:
nx.draw_networkx(test, node_size=150, node_color=colors)
In [ ]:
pcdqdata = np.loadtxt('G:/software/open_source/focs-master/tmp.txt', dtype=np.int32, delimiter='\t', usecols=(0, 1))
pcdqG = nx.Graph()
pcdqG.add_edges_from(pcdqdata)
g = nx.adjacency_matrix(pcdqG)
In [ ]:
pcdqS = {}
added = {}
prev = time.time()

z = preferredCommunities(pcdqG, 2, 0.6)
print time.time() - prev
In [ ]:
pcdqS = {}
added = {}
prev = time.time()

initializeCommunities(pcdqG, 2, pcdqS)
duplicationRemovel(pcdqS, 0.6)


#z = preferredCommunities(pcdqG, 2, 0.6)
print time.time() - prev
In [ ]:
#log = file('log1.txt', 'w')
#
#for u in sorted(pcdqS.keys())[:10]:
#    for v in sorted(pcdqS[u]):
#        if u == v:
#            continue
#        print u, v, zeta_scores[v]['cutoff'], zeta_scores[v][u]
#        #log.write("node %d nei node %d, cut-off %.3f, real score %.3f\n" % (u, v, zeta_scores[v]['cutoff'], zeta_scores[v][u]))
#        
#log.close()
In [ ]:
ground_truth = u"G:/project/dataset/社区发现/dblp/com-dblp.all.cmty.txt"
my_result = "G:/software/open_source/focs-master/focs-master/ordered.txt"

gt_fp = file(ground_truth)
mr_fp = file(my_result)

gt_content = gt_fp.readlines()
mr_content = mr_fp.readlines()
gt_fp.close()
mr_fp.close()
In [ ]:
gt_cmt_set = set()
mr_cmt_set = set()

for l in gt_content:
    c = tuple(int(e) for e in l.split())
    gt_cmt_set.add(c)
    
for l in mr_content:
    c = tuple(int(e) for e in l.split())
    mr_cmt_set.add(c)
        
In [ ]:
print len(gt_cmt_set), len(mr_cmt_set)
In [ ]:
x = set([1,2,3,5,2,3,4,5,2,3,4])
In [ ]:
from math import log
In [ ]:
N = 317080

c_gt = 0
for e in gt_cmt_set:
    c_i = len(e)
    c_gt += c_i * log(float(c_i) / N)

c_mr = 0
for e in mr_cmt_set:
    c_i = len(e)
    c_mr += c_i * log(float(c_i) / N)

print c_gt, c_mr
In [ ]:
comm_nmi = 0

#for i in gt_cmt_set:
#    for j in mr_cmt_set:
#        c_i = len(i)
#        c_j = len(j)
#        c_ij = len(set(i).intersection(set(j)))
#        if not c_ij:
#            continue
#        comm_nmi += c_ij * log(float(c_ij * N) / (c_i * c_j))
        
print comm_nmi
In [ ]:
from collections import defaultdict
gt_invert_index = defaultdict(list)
mr_invert_index = defaultdict(list)

#for s in gt_cmt_set:
#    for e in s:
#        gt_invert_index[e].append(s)
#        
#for s in mr_cmt_set:
#    for e in s:
#        mr_invert_index[e].append(s)

print len(gt_invert_index), len(mr_invert_index)
In [ ]:
comm_nmi = 0
#for e in gt_invert_index:
#    if e in mr_invert_index:
#        for i in gt_invert_index[e]:
#            for j in mr_invert_index[e]:
#                c_i = len(i)
#                c_j = len(j)
#                c_ij = len(set(i).intersection(set(j)))
#                comm_nmi += c_ij * log(float(c_ij * N) / (c_i * c_j))
print comm_nmi
In [ ]:
nmi = -2 * comm_nmi / (c_gt + c_mr)
In [ ]:
print nmi
In [ ]:
 
In [ ]:
def random_init_graph():
    
    import random
    from itertools import combinations

    edges = []
    for (u, v) in combinations(range(10), 2):
        r = random.random()
        if r < 0.5:
            edges.append((u, v))

    for (u, v) in combinations(range(8), 2):
        r = random.random()
        if r < 0.5:
            edges.append((u+10, v+10))

    for (u, v) in combinations(range(9), 2):
        r = random.random()
        if r < 0.6:
            edges.append((u+18, v+18))

    for (u, v) in combinations(range(5), 2):
        r = random.random()
        if r < 0.4:
            edges.append((u+27, v+27))

    for (u, v) in combinations(range(6), 2):
        r = random.random()
        if r < 0.1:
            edges.append((u, v+10))

    for (u, v) in combinations(range(7), 2):
        r = random.random()
        if r < 0.3:
            edges.append((u, v+18))

    for (u, v) in combinations(range(6), 2):
        r = random.random()
        if r < 0.2:
            edges.append((u, v+27))


    print  len(edges)
    test = nx.Graph()
    test.add_edges_from(edges)
    
    #fp = file('dataset'+str(len(test))+'.txt', 'w')
    #for (u, v) in edges:
    #    fp.write('%d %d\n' % (u, v))
    #fp.close()
    nx.draw(test,  with_label = True)
    pos = nx.spring_layout(test)
    nx.draw_networkx(test, pos=pos)
    colors = ['r'] * len(test.nodes())
    cmap = {0:'b', 10:'yellow', 27:'g'}
    #for e in test:
    #    for n in test[e]:
    #        colors[n] = cmap[e]
    #print colors
    #nx.draw_networkx(test, node_size=150, node_color=colors)
In [ ]:
random_init_graph()
In [ ]: