Supplemental online material for “Clustering, Seriation, and Subset Extraction of Confusion Data,” by Michael J. Brusco and J. Douglas Steinley.
Program 1: PREPARE.M
function [s1, s2, s3, d1, d2, d3] = prepare(confuse)
%PREPARE2 reads in a raw confusion matrix 'confuse', and prepares new matrices
%Five output matrices are produced (all diagonal terms are zeroed out)
%Matrix s1 is a symmetric similarity matrix obtained by the arithmetic mean
% of off diagonals
%Matrix s2 is a symmetric similarity matrix obtained by the arithmetic mean
% of off diagonals
%Matrix s3 is a symmetric similarity matrix based on Luce's (1963) SCM
%Matrix d1 is a symmetric dissimilarity matrix based on s1
%Matrix d2 is a symmetric dissimilarity matrix based on s2
%Matrix d3 is Shepard's (1957) pseudo Euclidean distance, which is
% computed as the natural log of the terms in s3.
epsilon = .000000000001; sse = 99999; [n,n1] = size(confuse); lambda = ones(n,n);
s1 = confuse' + confuse; s1 = s1./2;
for i = 1:n-1
for j = 1+i:n
s2(i,j) = (confuse(i,j).*confuse(j,i)).^.5; s2(j,i) = s2(i,j);
end
end
trig = 0;
for i = 1:n % Find out if there are any cases where
for j = 1:n % a(i,j) = a(j,i) = 0
if confuse(i,j) == 0 & confuse(j,i)==0
trig = 1;
end
end
end
if trig == 1 % Use Heiser's "1/2 rule" for raw confusion data
for i = 1:n-1 % and Gilmore et al.'s rule for confusion pcts.
for j = i+1:n
if confuse(i,j) == 0 & confuse(j,i)==0
if trace(confuse) > 2.*n
confuse(i,j) = confuse(i,j) + .5; confuse(j,i) = confuse(j,i) + .5;
else
confuse(i,j) = confuse(i,j) + .001; confuse(j,i) = confuse(j,i) + .001;
end
end
end
end
end
% This block fits the Shepard-Luce similarity choice matrix (s2) by using
% iterative proportional fitting as described by Heiser (1988).
lambda_new = zeros(n,n); eta = zeros(n,n);
while sse > epsilon
lambprev = lambda;
csum = sum(sum(confuse(:,:)));
lambda_sum = sum(sum(lambda(:,:)));
lambda = (csum./lambda_sum).*lambda;
rowcsum = sum(confuse');
rowlambdasum = sum(lambda');
for i = 1:n
for j = 1:n
lambda_new(i,j) = (rowcsum(i)./rowlambdasum(i)).* lambda(i,j);
end
end
lambda = lambda_new;
colcsum = sum(confuse);
collambdasum = sum(lambda);
for j = 1:n
for i = 1:n
lambda_new(i,j) = (colcsum(j)./collambdasum(j)).* lambda(i,j);
end
end
lambda = lambda_new;
for i = 1:n
for j = 1:n
lambda_new(i,j) = (confuse(i,j)+confuse(j,i)).*lambda(i,j)./(lambda(i,j)+lambda(j,i));
end
end
lambda = lambda_new;
diff = (lambda - lambprev).^2;
sse = sum(sum(diff(:,:)));
end
for i = 1:n
for j = 1:n
eta(i,j) = ((lambda(i,j).*lambda(j,i))./(lambda(i,i).*lambda(j,j))).^.5;
end
end
s3 = eta;
d3 = -log(s3); % Matrix d3 is Shepard's distance based on s3
%The block below will obtain dissimilarity matrices d1 and d2 based on
%matrices s1 and s2, respectively.
for i = 1:n
s1(i,i) = 0; s2(i,i) = 0;
end
maxs1 = max(max(s1)); maxs2 = max(max(s2)); d1 = maxs1 - s1; d2 = maxs2-s2;
for i = 1:n
d1(i,i) = 0; d2(i,i) = 0; d3(i,i) = 0;
end
Program 2: BBDIAM.M
function [partition, diameter] = bbdiam(a, num_clusters)
%BBDIAM find a minimum-diameter partition of 'e' objects corresponding to
% the dissimilarity matrix 'a'. The number of clusters in the
% partition is c = 'num_clusters'. The minimum diameter is
% 'diameter' and the 'partition' contains the optimal partition.
% This program uses 10 replication of an exchange algorithm to get
% a good upper bound on partition diameter.
% It also reorders using the rule from Brusco and Cradit (2004), but
% this is transparent to the user because the solution is remapped
% backed to the original ordering prior to returning 'partition'
tic;
[n1,e] = size(a); epsilon = .0000001; c = num_clusters;
bestz = 99999999999;
for ijk = 1:10
pp = 1:c; pp = repmat(pp,1,e); qq = randperm(e.*c); pp = pp(qq); pp = pp(1:e);
for k = 1:c
numk(k)=sum(pp==k); % Run exchange heuristic to get
end % a good initial bound
trig = 0;
while trig == 0
trig = 1;
for i = 1:e
k1 = pp(i);
if numk(k1)==1
continue
end
k1max = 0;
for l = 1:e
if pp(l) == k1 & a(i,l) > k1max
k1max = a(i,l);
end
end
for k = 1:c
if k1 == k
continue
end
kmax = 0;
for l = 1:e
if pp(l) == k & a(i,l) > kmax
kmax = a(i,l);
end
end
if kmax < k1max
numk(k) = numk(k)+1; numk(k1) = numk(k1) - 1; k1max = kmax;
pp(i) = k; k1 = k; trig = 0;
end
end
end
end
z = 0;
for i = 1:e-1
for j = i+1:e
if pp(i)==pp(j) & a(i,j) > z
z = a(i,j);
end
end
end
if z < bestz
bestz = z;
end
end
ss = zeros(1,e); % Reorder by looking at the number
for i = 1:e % of elements in each row of the
ss(i) = sum(a(i,:)>=bestz); % dissimilarity matrix that are
end % greater than the heuristic bound
[st,index] = sort(ss);
aa = a;
for i = 1:e
index2(i) = index(e-i+1);
end
a = a(index2,index2);
p = 0; q = c; z = bestz+.01 % Initialize
x = zeros(1,e); xb = zeros(1,e);
n = zeros(1,c);
optfound = 0; trig1 = 0; trig2 = 0;
while optfound == 0
if trig1 == 0 % Branch Forward
p = p + 1; m = 1; n(m) = n(m) + 1; x(p) = m;
if n(m) == 1
q = q - 1;
end
trig1 = 1;
end
if trig2 == 0
if e - p >= q % Check Feasibility
trig7 = 0;
for i = 1:p-1
k1 = x(i);
for j = i+1:p
k2 = x(j);
if k1 == k2 & a(i,j) > z - epsilon
trig7 = 1;
break
end
end
% if k1 == m & a(i,p) > z-epsilon;
% trig7 = 1;
% break
% end
end
if trig7 == 0 & q == 0
for j = p+1:e
ksum = zeros(1,c);
for i = 1:p
ki = x(i);
if a(i,j) > ksum(ki)
ksum(ki) = a(i,j);
end
end
kmin = min(ksum);
if kmin > z-epsilon
trig7 = 1;
break
end
end
end
if trig7 == 0
if p ~= e
trig1 = 0;
continue
else
xb = x; z = 0; % Install new best solution
for i = 1:p-1
ki = x(i);
for j = i+1:p
kj = x(j);
if ki == kj & a(i,j) > z
z = a(i,j);
end
end
end
end
end
end
end
if m == c || n(m) == 1 % Dispensation (if m = c or n(m) = 1)
x(p) = 0; % then Retract
n(m) = n(m) - 1; p = p - 1;
if n(m) == 0
q = q + 1;
end
if p == 0
optfound = 1;
continue
else
m = x(p); trig1 = 1; trig2 = 1;
end
else
% Otherwise, Branch Right
n(m) = n(m) - 1; m = m + 1; n(m) = n(m)+1; x(p) = m;
if n(m) == 1
q = q - 1;
end
trig1 = 1; trig2 = 0;
continue
end
end
diameter = z; a = aa; % Remap everything back.
for i = 1:e
partition(index2(i)) = xb(i);
end
toc
Program 3: RANDOPT.M
function [ari,newa,newb] = randopt(mata,matb,parta,partb,diama,diamb)
% RANDOPT.M reads two dissimilarity matrices (mata and matb), two
% partitions of those matrices (parta, partb), and the diameters of the
% partitions (diama and diamb). An exchange algorithm is used to refine
% the partitions to maximize the adjusted Rand index, while maintaining
% minimum-diameter partitions. The best-found ARI and the new partitions
% (newa and newb) are returned as output.
datax = mata; datay = matb; xx = parta; yy= partb; boundx = diama;
boundy = diamb;
m = length(xx); a = 0; b = 0; c = 0; d = 0; x = xx; y = yy;
kx = max(xx); ky = max(yy);
n = m.*(m-1)./2; % compute initial ARI
for i = 1:m-1
for j = i+1:m
if x(i) == x(j) & y(i) == y(j)
a = a + 1;
elseif x(i) ~= x(j) & y(i) ~= y(j)
d = d + 1;
elseif x(i) == x(j) & y(i) ~= y(j)
b = b + 1;
else
c = c + 1;
end
end
end
rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./(n.*n-((a+b).*(a+c)+(c+d).*(b+d)))
xopt = xx; yopt = yy;
trig = 0; % Begin exchange algorithm here.
while trig == 0 % If at least one improvement is
trig = 1; rb = rx; % found during a cycle, then trig = 0
for ii = 1:m
for k = 1:kx
if k == xx(ii)
continue
end
x = xx; x(ii) = k; itest = 0; y = yy;
for iii = 1:m
if x(ii) == x(iii) & datax(ii,iii) > boundx
itest = 1; break
end
end
if itest == 1
continue
end
a = 0; b = 0; c = 0; d = 0;
for i = 1:m-1
for j = i+1:m
if x(i) == x(j) & y(i) == y(j)
a = a + 1;
elseif x(i) ~= x(j) & y(i) ~= y(j)
d = d + 1;
elseif x(i) == x(j) & y(i) ~= y(j)
b = b + 1;
else
c = c + 1;
end
end
end
rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./
(n.*n-((a+b).*(a+c)+(c+d).*(b+d)));
if rx > rb
rb = rx; isel = ii; ksel = k; icol = 1; trig = 0;
end
end
end
for ii = 1:m
for k = 1:ky
if k == yy(ii)
continue
end
y = yy; y(ii) = k;itest = 0; x=xx;
for iii = 1:m
if y(ii) == y(iii) & datay(ii,iii) > boundy
itest = 1; break
end
end
if itest == 1
continue
end
a = 0; b = 0; c = 0; d = 0;
for i = 1:m-1
for j = i+1:m
if x(i) == x(j) & y(i) == y(j)
a = a + 1;
elseif x(i) ~= x(j) & y(i) ~= y(j)
d = d + 1;
elseif x(i) == x(j) & y(i) ~= y(j)
b = b + 1;
else
c = c + 1;
end
end
end
rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./
(n.*n-((a+b).*(a+c)+(c+d).*(b+d)));
if rx > rb
isel = ii; ksel = k; icol = 2; trig = 0;
rb = rx;
end
end
end
if trig == 0
if icol == 1
xx(isel) = ksel; rx = rb
else
yy(isel) = ksel; rx = rb
end
end
end
newa = xx; newb = yy; ari=rb;
Program 4: BBDOM.M
function [domindex,permopt, lindx, con, incon] = bbdom(confusion_matrix)
% BBDOM.M uses a branch-and-bound algorithm to fins a reordering of the
% rows and columns of an asymmetric matrix so as to maximize the sum of the
% elements above the main diagonal of the reordered matrix. The CPU time
% for this algorithm begins to accelerate for matrices larger than 20x20
tic; a = confusion_matrix;
[n,n1]=size(a);
for i = 1:n
a(i,i) = 0;
end
for i = 1:n-1
for j = i+1:n
b(i,j) = max(a(i,j),a(j,i)); b(j,i) = b(i,j);
end
end
nreps = 100; % Total number of replications
domindex=0; % Best z-value across all replications
for iii = 1:nreps
s = randperm(n); % Create a random permutation
z = sum(sum(triu(a(s,s)))); % Sum above the main diagonal
z = z - sum(diag(a));
zbest = z; % Best z-value for particular replication
sb = s;
dtarg=1; % Perform pairwise interchange to guarantee a
% local optimum with respect to that operation
while dtarg > eps
dtarg = 0;
for i = 1:n-1
for j = i+1:n
delta=a(s(j),s(i))-a(s(i),s(j));
for k = i+1:j-1
delta=delta-a(s(i),s(k))+a(s(j),s(k))+a(s(k),s(i))-a(s(k),s(j));
end
if delta > dtarg
dtarg = delta;
z = z + dtarg;
jdum = s(j);
s(j) = s(i);
s(i) = jdum;
end
end
end
end
if z > domindex
domindex = z;
end
end
z = domindex - 1
q = zeros(1,n+1); s = zeros(1,n+1);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
z1 = 0;
for j = 1:n
if j ~= q(m)
z1 = z1 + a(q(m),j);
end
end
while trigend == 0
if trig1 == 0
m = m + 1; trig1 = 1; % advance pointer
end
q(m) = q(m) + 1;
if s(q(m)) == 1 % redundancy
continue
end
if m == 1 & q(m) > n % terminate
trigend = 1;
continue
end
if m > 1 & q(m) > n % retract
s(q(m)) = 0; q(m) = 0; m = m - 1;
for i = 1:n
if s(i)==1 & i ~= q(m)
continue
end
z1 = z1 - a(q(m),i);
end
s(q(m)) = 0;
continue
end
if m == n
% for i = 1:n
% for j = 1:n-1
% if q(j) == i
% break
% end
% q(n) = i;
% if q(6) == 6 & q(7) == 10
% junk = 1;
% end
% end
% end
zbd = 0;
for i = 1:n-1
for j = i+1:n
zbd = zbd + a(q(i),q(j));
end
end
if zbd > z
z = zbd; x = q;
end
elseif m == 1
z1 = 0;
for j = 1:n
if j ~= q(m)
z1 = z1 + a(q(m),j);
end
end
trig1 = 0; s(q(m))=1;
else
if a(q(m),q(m-1)) > a(q(m-1),q(m))
continue
end
if a(q(m),q(m-1)) == a(q(m-1),q(m)) & q(m) < q(m-1)
continue
end
rtrig = 0;
for mm = m-2:-1:1
rdx = 0;
for i = mm:m-1
rdx = rdx + a(q(m),q(i)) - a(q(i),q(m));
end
if rdx > 0
rtrig = 1;
end
end
if rtrig == 1
continue
end
z2 = 0;
for i = 1:n
if s(i) == 0 & i ~= q(m)
z2 = z2 + a(q(m),i);
end
end
z3 = 0;
for i = 1:n-1
if s(i) == 1 || i == q(m)
continue
end
for j = i+1:n
if s(j) == 1 || j == q(m)
continue
end
z3 = z3 + b(i,j); %max(a(i,j),a(j,i));
end
end
if z1 + z2 + z3 > z
trig1 = 0; s(q(m)) = 1; z1 = z1 + z2;
end
end
end
x(n+1) = [];
reordered_matrix = a(x,x);
summat = sum(sum(a)); lindx = z./summat; con = 0; incon = 0;
for i = 1:n-1
for j = i+1:n
if reordered_matrix(i,j) > reordered_matrix(j,i)
con = con + 1;
end
if reordered_matrix(i,j) < reordered_matrix(j,i)
incon = incon + 1;
end
end
end
domindex = z; permopt = x;
toc;
Program 5: BBSUBSET.M
function [subset, index] = bbsubset(a, num_subsets, sub_size)
%BBSUBSET.M: This program reads an n-by-n matrix 'a' and will extract
%'num_subsets' each of size 'sub_size' based on the objective of maximizing
%the within subset sums of matrix elements minus the between subset sums of
%matrix elements. The program is fast when sub_size = 2, but can be much
%slower for sub_size > 2.
tic;
[n,n1]=size(a); ict = 0; c = num_subsets; gs = sub_size;
for k = 1:c
for h = 1:gs
ict = ict + 1; tau(ict) = k;
end
end
targ = ict;
ict = 0;
for i = 1:n-1
for j = i+1:n
ict = ict + 1; r(ict) = a(i,j); pair1(ict) = i; pair2(ict) = j;
end
end
% This block is designed to establish bounds for the between subset
% and within subset sums.
r = -r; [r,idx] = sort(r); r = -r; pair1 = pair1(idx); pair2 = pair2(idx);
delta = zeros(1,targ); omega = zeros(1,targ);
ws = 0; bs = 0;
ic = 0;
for k = 1:c
for h = 1:gs
ic = ic + 1;
for l = 1:h-1
ws = ws + 1;
end
delta(ic) = ws;
for l = 1:k-1
for u = 1:gs
bs = bs + 1;
end
end
omega(ic) = bs;
end
end
within_terms = 0; between_terms = 0;
for k = 1:c
within_terms = within_terms + gs.*(gs-1)./2;
end
for k = 1:c-1;
for h = k+1:c;
between_terms = between_terms + gs.*gs;
end
end
%This block is a greedy heuristic for obtaining an initial lower bound for
%the within subset sums minus the between subset sums.
sel = zeros(1,n);
prt = zeros(k,n);
for k = 1:c
amax = -99999999;
for i = 1:n-1
if sel(i) == 1
continue
end
for j = i+1:n
if sel(j) == 1
continue
end
asum = 0;
for h = 1:k-1
for l = 1:gs
asum = asum - a(i,prt(h,l)) - a(j,prt(h,l));
end
end
if a(i,j) + asum > amax
amax = a(i,j)+asum; isel = i; jsel = j;
end
end
end
prt(k,1) = isel; prt(k,2) = jsel; sel(isel) = 1; sel(jsel) = 1;
for i = 3:gs
amax = -99999999;
for j = 1:n
if sel(j) == 1
continue
end
asum = 0;
for h = 1:i-1
asum = asum + a(j,prt(k,h));
end
for h = 1:k-1
for l = 1:gs
asum = asum - a(j,prt(h,l));
end
end
if asum > amax
jsel = j; amax = asum;
end
end
prt(k,i) = jsel; sel(jsel) = 1;
end
end
z = 0;
for k = 1:c
for i = 1:gs-1
for j = i+1:gs
z = z + a(prt(k,i),prt(k,j));
end
end
end
for k = 1:c-1
for l = k+1:c
for i = 1:gs
for j = 1:gs
z = z - a(prt(k,i),prt(l,j));
end
end
end
end
z = z-1;
%The branch-and-bound algorithm begins here
q = zeros(1,n+1); s = zeros(1,n+1); test_sum = 0; tsum = zeros(1,n);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
while trigend == 0
if trig1 == 0
m = m + 1; trig1 = 1; % advance pointer
end
q(m) = q(m) + 1;
if s(q(m)) ~= 0 % redundancy
continue
end
if m > 2
if tau(m-1)==tau(m) & q(m) < q(m-1) % redundancy
continue
end
if tau(m-1)~=tau(m) & q(m) < q(m-gs) % redundancy
continue
end
end
if m == 1 & q(m) > n % terminate
trigend = 1;
continue
end
if m > 1 & q(m) > n % retract
s(q(m)) = 0; q(m) = 0; m = m - 1; test_sum = test_sum - tsum(m); tsum(m) = 0;
s(q(m)) = 0;
continue
end
for i = 1:m-1
v = q(i);
if tau(i) == tau(m)
tsum(m) = tsum(m) + a(v,q(m));
else
tsum(m) = tsum(m) - a(v,q(m));
end
end
test_sum = test_sum + tsum(m);
w = within_terms - delta(m); % b = between_terms - omega(m);
rbb = 0; rff = 0; isum = 0; isel = 0;
while isel < w
isum = isum + 1; pr1 = pair1(isum); pr2 = pair2(isum);
if s(pr1)~=0 & s(pr2) ~= 0
continue
end
if s(pr1) > 0 & s(pr1) ~= tau(m)
continue
end
if s(pr2) > 0 & s(pr2) ~= tau(m)
continue
end
rbb = rbb + r(isum); isel = isel + 1;
end
if test_sum + rbb - rff > z % Partial solution passes so
if m == targ % update incumbent if an optimum
z = test_sum;
partition = q;
test_sum = test_sum - tsum(m); tsum(m) = 0;
continue
end
trig1 = 0; s(q(m))=tau(m); % Otherwise, branch forward
else
test_sum = test_sum - tsum(m); tsum(m) = 0;
end
end
subset = partition; index = z;
toc