Supplemental Online Material for Clustering, Seriation, and Subset Extraction of Confusion

Supplemental online material for “Clustering, Seriation, and Subset Extraction of Confusion Data,” by Michael J. Brusco and J. Douglas Steinley.

Program 1: PREPARE.M

function [s1, s2, s3, d1, d2, d3] = prepare(confuse)

%PREPARE2 reads in a raw confusion matrix 'confuse', and prepares new matrices

%Five output matrices are produced (all diagonal terms are zeroed out)

%Matrix s1 is a symmetric similarity matrix obtained by the arithmetic mean

% of off diagonals

%Matrix s2 is a symmetric similarity matrix obtained by the arithmetic mean

% of off diagonals

%Matrix s3 is a symmetric similarity matrix based on Luce's (1963) SCM

%Matrix d1 is a symmetric dissimilarity matrix based on s1

%Matrix d2 is a symmetric dissimilarity matrix based on s2

%Matrix d3 is Shepard's (1957) pseudo Euclidean distance, which is

% computed as the natural log of the terms in s3.

epsilon = .000000000001; sse = 99999; [n,n1] = size(confuse); lambda = ones(n,n);

s1 = confuse' + confuse; s1 = s1./2;

for i = 1:n-1

for j = 1+i:n

s2(i,j) = (confuse(i,j).*confuse(j,i)).^.5; s2(j,i) = s2(i,j);

end

trig = 0;

for i = 1:n % Find out if there are any cases where

for j = 1:n % a(i,j) = a(j,i) = 0

if confuse(i,j) == 0 & confuse(j,i)==0

trig = 1;

end

if trig == 1 % Use Heiser's "1/2 rule" for raw confusion data

for i = 1:n-1 % and Gilmore et al.'s rule for confusion pcts.

for j = i+1:n

if confuse(i,j) == 0 & confuse(j,i)==0

if trace(confuse) > 2.*n

confuse(i,j) = confuse(i,j) + .5; confuse(j,i) = confuse(j,i) + .5;

else

confuse(i,j) = confuse(i,j) + .001; confuse(j,i) = confuse(j,i) + .001;

end

% This block fits the Shepard-Luce similarity choice matrix (s2) by using

% iterative proportional fitting as described by Heiser (1988).

lambda_new = zeros(n,n); eta = zeros(n,n);

while sse > epsilon

lambprev = lambda;

csum = sum(sum(confuse(:,:)));

lambda_sum = sum(sum(lambda(:,:)));

lambda = (csum./lambda_sum).*lambda;

rowcsum = sum(confuse');

rowlambdasum = sum(lambda');

for i = 1:n

for j = 1:n

lambda_new(i,j) = (rowcsum(i)./rowlambdasum(i)).* lambda(i,j);

end

lambda = lambda_new;

colcsum = sum(confuse);

collambdasum = sum(lambda);

for j = 1:n

for i = 1:n

lambda_new(i,j) = (colcsum(j)./collambdasum(j)).* lambda(i,j);

end

lambda = lambda_new;

for i = 1:n

for j = 1:n

lambda_new(i,j) = (confuse(i,j)+confuse(j,i)).*lambda(i,j)./(lambda(i,j)+lambda(j,i));

end

lambda = lambda_new;

diff = (lambda - lambprev).^2;

sse = sum(sum(diff(:,:)));

end

for i = 1:n

for j = 1:n

eta(i,j) = ((lambda(i,j).*lambda(j,i))./(lambda(i,i).*lambda(j,j))).^.5;

end

s3 = eta;

d3 = -log(s3); % Matrix d3 is Shepard's distance based on s3

%The block below will obtain dissimilarity matrices d1 and d2 based on

%matrices s1 and s2, respectively.

for i = 1:n

s1(i,i) = 0; s2(i,i) = 0;

end

maxs1 = max(max(s1)); maxs2 = max(max(s2)); d1 = maxs1 - s1; d2 = maxs2-s2;

for i = 1:n

d1(i,i) = 0; d2(i,i) = 0; d3(i,i) = 0;

end

Program 2: BBDIAM.M

function [partition, diameter] = bbdiam(a, num_clusters)

%BBDIAM find a minimum-diameter partition of 'e' objects corresponding to

% the dissimilarity matrix 'a'. The number of clusters in the

% partition is c = 'num_clusters'. The minimum diameter is

% 'diameter' and the 'partition' contains the optimal partition.

% This program uses 10 replication of an exchange algorithm to get

% a good upper bound on partition diameter.

% It also reorders using the rule from Brusco and Cradit (2004), but

% this is transparent to the user because the solution is remapped

% backed to the original ordering prior to returning 'partition'

tic;

[n1,e] = size(a); epsilon = .0000001; c = num_clusters;

bestz = 99999999999;

for ijk = 1:10

pp = 1:c; pp = repmat(pp,1,e); qq = randperm(e.*c); pp = pp(qq); pp = pp(1:e);

for k = 1:c

numk(k)=sum(pp==k); % Run exchange heuristic to get

end % a good initial bound

trig = 0;

while trig == 0

trig = 1;

for i = 1:e

k1 = pp(i);

if numk(k1)==1

continue

end

k1max = 0;

for l = 1:e

if pp(l) == k1 & a(i,l) > k1max

k1max = a(i,l);

end

for k = 1:c

if k1 == k

continue

end

kmax = 0;

for l = 1:e

if pp(l) == k & a(i,l) > kmax

kmax = a(i,l);

end

if kmax < k1max

numk(k) = numk(k)+1; numk(k1) = numk(k1) - 1; k1max = kmax;

pp(i) = k; k1 = k; trig = 0;

end

z = 0;

for i = 1:e-1

for j = i+1:e

if pp(i)==pp(j) & a(i,j) > z

z = a(i,j);

end

if z < bestz

bestz = z;

end

ss = zeros(1,e); % Reorder by looking at the number

for i = 1:e % of elements in each row of the

ss(i) = sum(a(i,:)>=bestz); % dissimilarity matrix that are

end % greater than the heuristic bound

[st,index] = sort(ss);

aa = a;

for i = 1:e

index2(i) = index(e-i+1);

end

a = a(index2,index2);

p = 0; q = c; z = bestz+.01 % Initialize

x = zeros(1,e); xb = zeros(1,e);

n = zeros(1,c);

optfound = 0; trig1 = 0; trig2 = 0;

while optfound == 0

if trig1 == 0 % Branch Forward

p = p + 1; m = 1; n(m) = n(m) + 1; x(p) = m;

if n(m) == 1

q = q - 1;

end

trig1 = 1;

end

if trig2 == 0

if e - p >= q % Check Feasibility

trig7 = 0;

for i = 1:p-1

k1 = x(i);

for j = i+1:p

k2 = x(j);

if k1 == k2 & a(i,j) > z - epsilon

trig7 = 1;

break

end

% if k1 == m & a(i,p) > z-epsilon;

% trig7 = 1;

% break

% end

end

if trig7 == 0 & q == 0

for j = p+1:e

ksum = zeros(1,c);

for i = 1:p

ki = x(i);

if a(i,j) > ksum(ki)

ksum(ki) = a(i,j);

end

kmin = min(ksum);

if kmin > z-epsilon

trig7 = 1;

break

end

if trig7 == 0

if p ~= e

trig1 = 0;

continue

else

xb = x; z = 0; % Install new best solution

for i = 1:p-1

ki = x(i);

for j = i+1:p

kj = x(j);

if ki == kj & a(i,j) > z

z = a(i,j);

end

if m == c || n(m) == 1 % Dispensation (if m = c or n(m) = 1)

x(p) = 0; % then Retract

n(m) = n(m) - 1; p = p - 1;

if n(m) == 0

q = q + 1;

end

if p == 0

optfound = 1;

continue

else

m = x(p); trig1 = 1; trig2 = 1;

end

else

% Otherwise, Branch Right

n(m) = n(m) - 1; m = m + 1; n(m) = n(m)+1; x(p) = m;

if n(m) == 1

q = q - 1;

end

trig1 = 1; trig2 = 0;

continue

end

diameter = z; a = aa; % Remap everything back.

for i = 1:e

partition(index2(i)) = xb(i);

end

toc

Program 3: RANDOPT.M

function [ari,newa,newb] = randopt(mata,matb,parta,partb,diama,diamb)

% RANDOPT.M reads two dissimilarity matrices (mata and matb), two

% partitions of those matrices (parta, partb), and the diameters of the

% partitions (diama and diamb). An exchange algorithm is used to refine

% the partitions to maximize the adjusted Rand index, while maintaining

% minimum-diameter partitions. The best-found ARI and the new partitions

% (newa and newb) are returned as output.

datax = mata; datay = matb; xx = parta; yy= partb; boundx = diama;

boundy = diamb;

m = length(xx); a = 0; b = 0; c = 0; d = 0; x = xx; y = yy;

kx = max(xx); ky = max(yy);

n = m.*(m-1)./2; % compute initial ARI

for i = 1:m-1

for j = i+1:m

if x(i) == x(j) & y(i) == y(j)

a = a + 1;

elseif x(i) ~= x(j) & y(i) ~= y(j)

d = d + 1;

elseif x(i) == x(j) & y(i) ~= y(j)

b = b + 1;

else

c = c + 1;

end

rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./(n.*n-((a+b).*(a+c)+(c+d).*(b+d)))

xopt = xx; yopt = yy;

trig = 0; % Begin exchange algorithm here.

while trig == 0 % If at least one improvement is

trig = 1; rb = rx; % found during a cycle, then trig = 0

for ii = 1:m

for k = 1:kx

if k == xx(ii)

continue

end

x = xx; x(ii) = k; itest = 0; y = yy;

for iii = 1:m

if x(ii) == x(iii) & datax(ii,iii) > boundx

itest = 1; break

end

if itest == 1

continue

end

a = 0; b = 0; c = 0; d = 0;

for i = 1:m-1

for j = i+1:m

if x(i) == x(j) & y(i) == y(j)

a = a + 1;

elseif x(i) ~= x(j) & y(i) ~= y(j)

d = d + 1;

elseif x(i) == x(j) & y(i) ~= y(j)

b = b + 1;

else

c = c + 1;

end

rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./

(n.*n-((a+b).*(a+c)+(c+d).*(b+d)));

if rx > rb

rb = rx; isel = ii; ksel = k; icol = 1; trig = 0;

end

for ii = 1:m

for k = 1:ky

if k == yy(ii)

continue

end

y = yy; y(ii) = k;itest = 0; x=xx;

for iii = 1:m

if y(ii) == y(iii) & datay(ii,iii) > boundy

itest = 1; break

end

if itest == 1

continue

end

a = 0; b = 0; c = 0; d = 0;

for i = 1:m-1

for j = i+1:m

if x(i) == x(j) & y(i) == y(j)

a = a + 1;

elseif x(i) ~= x(j) & y(i) ~= y(j)

d = d + 1;

elseif x(i) == x(j) & y(i) ~= y(j)

b = b + 1;

else

c = c + 1;

end

rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./

(n.*n-((a+b).*(a+c)+(c+d).*(b+d)));

if rx > rb

isel = ii; ksel = k; icol = 2; trig = 0;

rb = rx;

end

if trig == 0

if icol == 1

xx(isel) = ksel; rx = rb

else

yy(isel) = ksel; rx = rb

end

newa = xx; newb = yy; ari=rb;

Program 4: BBDOM.M

function [domindex,permopt, lindx, con, incon] = bbdom(confusion_matrix)

% BBDOM.M uses a branch-and-bound algorithm to fins a reordering of the

% rows and columns of an asymmetric matrix so as to maximize the sum of the

% elements above the main diagonal of the reordered matrix. The CPU time

% for this algorithm begins to accelerate for matrices larger than 20x20

tic; a = confusion_matrix;

[n,n1]=size(a);

for i = 1:n

a(i,i) = 0;

end

for i = 1:n-1

for j = i+1:n

b(i,j) = max(a(i,j),a(j,i)); b(j,i) = b(i,j);

end

nreps = 100; % Total number of replications

domindex=0; % Best z-value across all replications

for iii = 1:nreps

s = randperm(n); % Create a random permutation

z = sum(sum(triu(a(s,s)))); % Sum above the main diagonal

z = z - sum(diag(a));

zbest = z; % Best z-value for particular replication

sb = s;

dtarg=1; % Perform pairwise interchange to guarantee a

% local optimum with respect to that operation

while dtarg > eps

dtarg = 0;

for i = 1:n-1

for j = i+1:n

delta=a(s(j),s(i))-a(s(i),s(j));

for k = i+1:j-1

delta=delta-a(s(i),s(k))+a(s(j),s(k))+a(s(k),s(i))-a(s(k),s(j));

end

if delta > dtarg

dtarg = delta;

z = z + dtarg;

jdum = s(j);

s(j) = s(i);

s(i) = jdum;

end

if z > domindex

domindex = z;

end

z = domindex - 1

q = zeros(1,n+1); s = zeros(1,n+1);

m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;

z1 = 0;

for j = 1:n

if j ~= q(m)

z1 = z1 + a(q(m),j);

end

while trigend == 0

if trig1 == 0

m = m + 1; trig1 = 1; % advance pointer

end

q(m) = q(m) + 1;

if s(q(m)) == 1 % redundancy

continue

end

if m == 1 & q(m) > n % terminate

trigend = 1;

continue

end

if m > 1 & q(m) > n % retract

s(q(m)) = 0; q(m) = 0; m = m - 1;

for i = 1:n

if s(i)==1 & i ~= q(m)

continue

end

z1 = z1 - a(q(m),i);

end

s(q(m)) = 0;

continue

end

if m == n

% for i = 1:n

% for j = 1:n-1

% if q(j) == i

% break

% end

% q(n) = i;

% if q(6) == 6 & q(7) == 10

% junk = 1;

% end

zbd = 0;

for i = 1:n-1

for j = i+1:n

zbd = zbd + a(q(i),q(j));

end

if zbd > z

z = zbd; x = q;

end

elseif m == 1

z1 = 0;

for j = 1:n

if j ~= q(m)

z1 = z1 + a(q(m),j);

end

trig1 = 0; s(q(m))=1;

else

if a(q(m),q(m-1)) > a(q(m-1),q(m))

continue

end

if a(q(m),q(m-1)) == a(q(m-1),q(m)) & q(m) < q(m-1)

continue

end

rtrig = 0;

for mm = m-2:-1:1

rdx = 0;

for i = mm:m-1

rdx = rdx + a(q(m),q(i)) - a(q(i),q(m));

end

if rdx > 0

rtrig = 1;

end

if rtrig == 1

continue

end

z2 = 0;

for i = 1:n

if s(i) == 0 & i ~= q(m)

z2 = z2 + a(q(m),i);

end

z3 = 0;

for i = 1:n-1

if s(i) == 1 || i == q(m)

continue

end

for j = i+1:n

if s(j) == 1 || j == q(m)

continue

end

z3 = z3 + b(i,j); %max(a(i,j),a(j,i));

end

if z1 + z2 + z3 > z

trig1 = 0; s(q(m)) = 1; z1 = z1 + z2;

end

x(n+1) = [];

reordered_matrix = a(x,x);

summat = sum(sum(a)); lindx = z./summat; con = 0; incon = 0;

for i = 1:n-1

for j = i+1:n

if reordered_matrix(i,j) > reordered_matrix(j,i)

con = con + 1;

end

if reordered_matrix(i,j) < reordered_matrix(j,i)

incon = incon + 1;

end

domindex = z; permopt = x;

toc;

Program 5: BBSUBSET.M

function [subset, index] = bbsubset(a, num_subsets, sub_size)

%BBSUBSET.M: This program reads an n-by-n matrix 'a' and will extract

%'num_subsets' each of size 'sub_size' based on the objective of maximizing

%the within subset sums of matrix elements minus the between subset sums of

%matrix elements. The program is fast when sub_size = 2, but can be much

%slower for sub_size > 2.

tic;

[n,n1]=size(a); ict = 0; c = num_subsets; gs = sub_size;

for k = 1:c

for h = 1:gs

ict = ict + 1; tau(ict) = k;

end

targ = ict;

ict = 0;

for i = 1:n-1

for j = i+1:n

ict = ict + 1; r(ict) = a(i,j); pair1(ict) = i; pair2(ict) = j;

end

% This block is designed to establish bounds for the between subset

% and within subset sums.

r = -r; [r,idx] = sort(r); r = -r; pair1 = pair1(idx); pair2 = pair2(idx);

delta = zeros(1,targ); omega = zeros(1,targ);

ws = 0; bs = 0;

ic = 0;

for k = 1:c

for h = 1:gs

ic = ic + 1;

for l = 1:h-1

ws = ws + 1;

end

delta(ic) = ws;

for l = 1:k-1

for u = 1:gs

bs = bs + 1;

end

omega(ic) = bs;

end

within_terms = 0; between_terms = 0;

for k = 1:c

within_terms = within_terms + gs.*(gs-1)./2;

end

for k = 1:c-1;

for h = k+1:c;

between_terms = between_terms + gs.*gs;

end

%This block is a greedy heuristic for obtaining an initial lower bound for

%the within subset sums minus the between subset sums.

sel = zeros(1,n);

prt = zeros(k,n);

for k = 1:c

amax = -99999999;

for i = 1:n-1

if sel(i) == 1

continue

end

for j = i+1:n

if sel(j) == 1

continue

end

asum = 0;

for h = 1:k-1

for l = 1:gs

asum = asum - a(i,prt(h,l)) - a(j,prt(h,l));

end

if a(i,j) + asum > amax

amax = a(i,j)+asum; isel = i; jsel = j;

end

prt(k,1) = isel; prt(k,2) = jsel; sel(isel) = 1; sel(jsel) = 1;

for i = 3:gs

amax = -99999999;

for j = 1:n

if sel(j) == 1

continue

end

asum = 0;

for h = 1:i-1

asum = asum + a(j,prt(k,h));

end

for h = 1:k-1

for l = 1:gs

asum = asum - a(j,prt(h,l));

end

if asum > amax

jsel = j; amax = asum;

end

prt(k,i) = jsel; sel(jsel) = 1;

end

z = 0;

for k = 1:c

for i = 1:gs-1

for j = i+1:gs

z = z + a(prt(k,i),prt(k,j));

end

for k = 1:c-1

for l = k+1:c

for i = 1:gs

for j = 1:gs

z = z - a(prt(k,i),prt(l,j));

end

z = z-1;

%The branch-and-bound algorithm begins here

q = zeros(1,n+1); s = zeros(1,n+1); test_sum = 0; tsum = zeros(1,n);

m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;

while trigend == 0

if trig1 == 0

m = m + 1; trig1 = 1; % advance pointer

end

q(m) = q(m) + 1;

if s(q(m)) ~= 0 % redundancy

continue

end

if m > 2

if tau(m-1)==tau(m) & q(m) < q(m-1) % redundancy

continue

end

if tau(m-1)~=tau(m) & q(m) < q(m-gs) % redundancy

continue

end

if m == 1 & q(m) > n % terminate

trigend = 1;

continue

end

if m > 1 & q(m) > n % retract

s(q(m)) = 0; q(m) = 0; m = m - 1; test_sum = test_sum - tsum(m); tsum(m) = 0;

s(q(m)) = 0;

continue

end

for i = 1:m-1

v = q(i);

if tau(i) == tau(m)

tsum(m) = tsum(m) + a(v,q(m));

else

tsum(m) = tsum(m) - a(v,q(m));

end

test_sum = test_sum + tsum(m);

w = within_terms - delta(m); % b = between_terms - omega(m);

rbb = 0; rff = 0; isum = 0; isel = 0;

while isel < w

isum = isum + 1; pr1 = pair1(isum); pr2 = pair2(isum);

if s(pr1)~=0 & s(pr2) ~= 0

continue

end

if s(pr1) > 0 & s(pr1) ~= tau(m)

continue

end

if s(pr2) > 0 & s(pr2) ~= tau(m)

continue

end

rbb = rbb + r(isum); isel = isel + 1;

end

if test_sum + rbb - rff > z % Partial solution passes so

if m == targ % update incumbent if an optimum

z = test_sum;

partition = q;

test_sum = test_sum - tsum(m); tsum(m) = 0;

continue

end

trig1 = 0; s(q(m))=tau(m); % Otherwise, branch forward

else

test_sum = test_sum - tsum(m); tsum(m) = 0;

end

subset = partition; index = z;

toc