1. Data management script. Each line of the original input file has the tab-separated format: id\tclusterId\tgoldstandardId
DataManagement.py
#!/usr/bin/python
"""Convert a tab-separated clustering result into two pickled lookup tables.

Every usable input line holds three integer fields separated by tabs:
``id<TAB>clusterId<TAB>goldstandardId``.

Usage: DataManagement.py SOURCE CLUSTERS_OUT GOLDSTANDARDS_OUT
"""
try:
    import cPickle as p  # Python 2
except ImportError:
    import pickle as p  # Python 3
import re
import sys

# Matches the whitespace run at the start or at the end of a line.
preturn = re.compile(r'(^\s+|\s+$)')


def parse_assignments(lines):
    """Group document ids by cluster id and by gold-standard id.

    Args:
        lines: iterable of text lines, each expected to contain
            ``id<TAB>clusterId<TAB>goldstandardId``.

    Returns:
        A ``(clusters, goldstandards)`` tuple of dicts.  ``clusters`` maps
        each cluster id to the list of document ids assigned to it and
        ``goldstandards`` does the same for gold-standard ids.  Ids are
        ints and the lists keep input order.

    Raises:
        ValueError: if a three-field line contains a non-integer field.
    """
    clusters = {}
    goldstandards = {}
    for line in lines:
        m = preturn.sub('', line).split('\t')
        if len(m) != 3:
            # Lines without exactly three fields (blank lines, headers)
            # are skipped.
            continue
        doc_id, cluster_id, gold_id = (int(field) for field in m)
        clusters.setdefault(cluster_id, []).append(doc_id)
        goldstandards.setdefault(gold_id, []).append(doc_id)
    return clusters, goldstandards


def main(argv):
    """Read ``argv[1]`` and pickle the two tables to ``argv[2]`` / ``argv[3]``.

    Args:
        argv: command-line vector; ``argv[0]`` is the program name.
    """
    if len(argv) < 4:
        sys.exit('usage: %s SOURCE CLUSTERS_OUT GOLDSTANDARDS_OUT' % argv[0])
    with open(argv[1], 'r') as fidsrc:
        clusters, goldstandards = parse_assignments(fidsrc)
    # Pickle streams are binary data, so the outputs are opened in 'wb'.
    with open(argv[2], 'wb') as fidclusters:
        p.dump(clusters, fidclusters)
    with open(argv[3], 'wb') as fidgoldstandards:
        p.dump(goldstandards, fidgoldstandards)
    print('%s has finished!' % argv[0])


if __name__ == "__main__":
    main(sys.argv)
EvaluationClusterAlgorithm.py
#!/usr/bin/python
# -*- coding: cp936 -*-
import math
import re
import sys

try:
    import cPickle as mypickle  # Python 2
except ImportError:
    import pickle as mypickle  # Python 3
class Evaluation(object):
    """Score a clustering result against gold-standard classes.

    Both inputs are pickled dicts mapping a group id to the list of
    document ids in that group.  The class offers purity, entropy, mutual
    information and normalized mutual information (NMI).

    Attributes:
        clusters: dict, cluster id -> list of document ids.
        goldstandards: dict, gold-standard id -> list of document ids.
        k: number of clusters.
        q: number of gold-standard classes.
        minclusterId / maxclusterId: smallest / largest cluster id.
        mingoldstandardId / maxgoldstandardId: smallest / largest gold id.
        coocurrence: dict, (cluster id, gold id) -> number of shared
            documents; filled by GenerateCoocurrence().
        N: total number of documents.
    """

    def __init__(self, clusterfid, goldstandardfid):
        """Load both pickled tables and derive the summary attributes.

        Args:
            clusterfid: path of the pickled cluster table.
            goldstandardfid: path of the pickled gold-standard table.

        Raises:
            ValueError: if either table is empty, holds no documents, or
                the two tables do not contain the same number of documents.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that come from a trusted source.
        self.clusters = self._load_pickle(clusterfid)
        self.goldstandards = self._load_pickle(goldstandardfid)
        if not self.clusters or not self.goldstandards:
            raise ValueError('the cluster and gold-standard tables must not be empty')

        tempclusterkeys = sorted(self.clusters)
        tempgoldstandardkeys = sorted(self.goldstandards)
        self.k = len(tempclusterkeys)
        self.q = len(tempgoldstandardkeys)
        self.minclusterId = tempclusterkeys[0]
        self.maxclusterId = tempclusterkeys[-1]
        self.mingoldstandardId = tempgoldstandardkeys[0]
        self.maxgoldstandardId = tempgoldstandardkeys[-1]
        self.coocurrence = {}

        N1 = sum(len(members) for members in self.clusters.values())
        N2 = sum(len(members) for members in self.goldstandards.values())
        if N1 != N2 or N1 == 0:
            # Fail fast: every metric divides by N, so an inconsistent or
            # zero document count cannot produce a meaningful score.
            raise ValueError(
                'there is an error: N1 = %d, N2 = %d, '
                'please re-examine the data source' % (N1, N2)
            )
        self.N = N1

    @staticmethod
    def _load_pickle(path):
        """Return the object pickled in *path*, closing the file afterwards."""
        with open(path, 'rb') as handle:
            return mypickle.load(handle)

    def _ensure_coocurrence(self):
        """Build the co-occurrence table if it is missing or incomplete."""
        if len(self.coocurrence) < len(self.clusters) * len(self.goldstandards):
            self.GenerateCoocurrence()

    def GenerateCoocurrence(self):
        """Fill ``self.coocurrence`` with the overlap size of every pair."""
        # Build each gold-standard set once instead of once per cluster.
        gold_sets = {
            key_gold: set(members)
            for key_gold, members in self.goldstandards.items()
        }
        for key_cluster, members in self.clusters.items():
            set1 = set(members)
            for key_gold, set2 in gold_sets.items():
                self.coocurrence[(key_cluster, key_gold)] = len(set1 & set2)

    def CalPurityForPerCluster(self, clusterId):
        """Return the share of *clusterId* taken by its dominant gold class.

        An empty cluster yields 0.0.
        """
        self._ensure_coocurrence()
        size = len(self.clusters[clusterId])
        if size == 0:
            return 0.0
        # Iterate the real gold ids, so gaps in the id range are harmless.
        best = max(
            self.coocurrence[(clusterId, goldId)] for goldId in self.goldstandards
        )
        return float(best) / float(size)

    def CalPurity(self):
        """Return the overall purity: per-cluster purity weighted by size."""
        result = 0.0
        for clusterId in sorted(self.clusters):
            purityPer = self.CalPurityForPerCluster(clusterId)
            result = result + float(len(self.clusters[clusterId])) * purityPer / float(self.N)
        return result

    def CalEntropyFormula(self, seq):
        """Return the base-2 entropy ``-sum(p * log2(p))`` of *seq*.

        Non-positive entries are skipped (0 * log 0 is taken as 0).
        """
        result = 0.0
        for elemP in seq:
            if elemP > 0:
                result = result + elemP * math.log(elemP, 2)
        return -result

    def CalEntropyForPerCluster(self, clusterId):
        """Return the entropy of the gold-class distribution in *clusterId*.

        An empty cluster yields 0.0.
        """
        self._ensure_coocurrence()
        size = float(len(self.clusters[clusterId]))
        if size == 0:
            return 0.0
        seq = [
            float(self.coocurrence[(clusterId, goldId)]) / size
            for goldId in sorted(self.goldstandards)
        ]
        return self.CalEntropyFormula(seq)

    def CalEntropy(self):
        """Return the overall entropy: per-cluster entropy weighted by size."""
        result = 0
        for clusterId in sorted(self.clusters):
            entropyPer = self.CalEntropyForPerCluster(clusterId)
            result = result + float(len(self.clusters[clusterId])) * entropyPer / float(self.N)
        return result

    def CalMutualInformation(self):
        """Return the mutual information (in bits) of the two partitions."""
        self._ensure_coocurrence()
        total = float(self.N)
        goldIds = sorted(self.goldstandards)
        result = 0.0
        for clusterId in sorted(self.clusters):
            N_c = len(self.clusters[clusterId])
            for goldId in goldIds:
                N_cg = self.coocurrence[(clusterId, goldId)]
                if N_cg <= 0:
                    # No shared documents: the term is 0, and skipping it
                    # also avoids dividing by an empty group's size.
                    continue
                N_g = len(self.goldstandards[goldId])
                part = total * float(N_cg) / (N_c * N_g)
                result = result + (float(N_cg) / total) * math.log(part, 2)
        return result

    def CalNMI(self):
        """Return the NMI, ``2 * I / (H_clusters + H_gold)``.

        When both partitions consist of a single group, both entropies are
        zero; the partitions are then identical and 1.0 is returned instead
        of dividing by zero.
        """
        total = float(self.N)
        # Entropy of the automatically generated clusters.
        seq1 = [
            float(len(self.clusters[clusterId])) / total
            for clusterId in sorted(self.clusters)
        ]
        # Entropy of the gold-standard classes.
        seq2 = [
            float(len(self.goldstandards[goldId])) / total
            for goldId in sorted(self.goldstandards)
        ]
        H1 = self.CalEntropyFormula(seq1)
        H2 = self.CalEntropyFormula(seq2)
        denominator = H1 + H2
        if denominator == 0:
            return 1.0
        IG = self.CalMutualInformation()
        return 2 * IG / denominator
if __name__ == "__main__":
    # Usage: EvaluationClusterAlgorithm.py CLUSTERS_PICKLE GOLDSTANDARDS_PICKLE
    if len(sys.argv) < 3:
        sys.exit('usage: %s CLUSTERS_PICKLE GOLDSTANDARDS_PICKLE' % sys.argv[0])
    clusterAddress = str(sys.argv[1])
    goldAddress = str(sys.argv[2])
    e = Evaluation(clusterAddress, goldAddress)

    # Summary of the two loaded tables.
    print('number of clusters generated by the clustering algorithm %d' % e.k)
    print('number of clusters in the manually labeled standard answer %d' % e.q)
    print('total document count %d' % e.N)
    print('minimum cluster ID %d' % e.minclusterId)
    print('maximum cluster ID %d' % e.maxclusterId)
    print('minimum cluster ID in the standard answer %d' % e.mingoldstandardId)
    print('maximum cluster ID in the standard answer %d' % e.maxgoldstandardId)

    # The co-occurrence table is built before any metric is computed.
    e.GenerateCoocurrence()

    purity = e.CalPurity()
    print('purity: %f' % purity)
    entropy = e.CalEntropy()
    print('entropy %f' % entropy)
    nmi = e.CalNMI()
    print('normalized mutual information is %f' % nmi)
Code call