Supplemental online material for “Clustering, Seriation, and Subset Extraction of Confusion Data,” by Michael J. Brusco and J. Douglas Steinley.

Program 1: PREPARE.M

function [s1, s2, s3, d1, d2, d3] = prepare(confuse)

%PREPARE2 reads in a raw confusion matrix 'confuse', and prepares new matrices

%Five output matrices are produced (all diagonal terms are zeroed out)

%Matrix s1 is a symmetric similarity matrix obtained by the arithmetic mean

% of off diagonals

%Matrix s2 is a symmetric similarity matrix obtained by the arithmetic mean

% of off diagonals

%Matrix s3 is a symmetric similarity matrix based on Luce's (1963) SCM

%Matrix d1 is a symmetric dissimilarity matrix based on s1

%Matrix d2 is a symmetric dissimilarity matrix based on s2

%Matrix d3 is Shepard's (1957) pseudo Euclidean distance, which is

% computed as the natural log of the terms in s3.

epsilon = .000000000001; sse = 99999; [n,n1] = size(confuse); lambda = ones(n,n);

s1 = confuse' + confuse; s1 = s1./2;

for i = 1:n-1

for j = 1+i:n

s2(i,j) = (confuse(i,j).*confuse(j,i)).^.5; s2(j,i) = s2(i,j);

end

end

trig = 0;

for i = 1:n % Find out if there are any cases where

for j = 1:n % a(i,j) = a(j,i) = 0

if confuse(i,j) == 0 & confuse(j,i)==0

trig = 1;

end

end

end

if trig == 1 % Use Heiser's "1/2 rule" for raw confusion data

for i = 1:n-1 % and Gilmore et al.'s rule for confusion pcts.

for j = i+1:n

if confuse(i,j) == 0 & confuse(j,i)==0

if trace(confuse) > 2.*n

confuse(i,j) = confuse(i,j) + .5; confuse(j,i) = confuse(j,i) + .5;

else

confuse(i,j) = confuse(i,j) + .001; confuse(j,i) = confuse(j,i) + .001;

end

end

end

end

end

% This block fits the Shepard-Luce similarity choice matrix (s2) by using

% iterative proportional fitting as described by Heiser (1988).

lambda_new = zeros(n,n); eta = zeros(n,n);

while sse > epsilon

lambprev = lambda;

csum = sum(sum(confuse(:,:)));

lambda_sum = sum(sum(lambda(:,:)));

lambda = (csum./lambda_sum).*lambda;

rowcsum = sum(confuse');

rowlambdasum = sum(lambda');

for i = 1:n

for j = 1:n

lambda_new(i,j) = (rowcsum(i)./rowlambdasum(i)).* lambda(i,j);

end

end

lambda = lambda_new;

colcsum = sum(confuse);

collambdasum = sum(lambda);

for j = 1:n

for i = 1:n

lambda_new(i,j) = (colcsum(j)./collambdasum(j)).* lambda(i,j);

end

end

lambda = lambda_new;

for i = 1:n

for j = 1:n

lambda_new(i,j) = (confuse(i,j)+confuse(j,i)).*lambda(i,j)./(lambda(i,j)+lambda(j,i));

end

end

lambda = lambda_new;

diff = (lambda - lambprev).^2;

sse = sum(sum(diff(:,:)));

end

for i = 1:n

for j = 1:n

eta(i,j) = ((lambda(i,j).*lambda(j,i))./(lambda(i,i).*lambda(j,j))).^.5;

end

end

s3 = eta;

d3 = -log(s3); % Matrix d3 is Shepard's distance based on s3

%The block below will obtain dissimilarity matrices d1 and d2 based on

%matrices s1 and s2, respectively.

for i = 1:n

s1(i,i) = 0; s2(i,i) = 0;

end

maxs1 = max(max(s1)); maxs2 = max(max(s2)); d1 = maxs1 - s1; d2 = maxs2-s2;

for i = 1:n

d1(i,i) = 0; d2(i,i) = 0; d3(i,i) = 0;

end

Program 2: BBDIAM.M

function [partition, diameter] = bbdiam(a, num_clusters)

%BBDIAM find a minimum-diameter partition of 'e' objects corresponding to

% the dissimilarity matrix 'a'. The number of clusters in the

% partition is c = 'num_clusters'. The minimum diameter is

% 'diameter' and the 'partition' contains the optimal partition.

% This program uses 10 replication of an exchange algorithm to get

% a good upper bound on partition diameter.

% It also reorders using the rule from Brusco and Cradit (2004), but

% this is transparent to the user because the solution is remapped

% backed to the original ordering prior to returning 'partition'

tic;

[n1,e] = size(a); epsilon = .0000001; c = num_clusters;

bestz = 99999999999;

for ijk = 1:10

pp = 1:c; pp = repmat(pp,1,e); qq = randperm(e.*c); pp = pp(qq); pp = pp(1:e);

for k = 1:c

numk(k)=sum(pp==k); % Run exchange heuristic to get

end % a good initial bound

trig = 0;

while trig == 0

trig = 1;

for i = 1:e

k1 = pp(i);

if numk(k1)==1

continue

end

k1max = 0;

for l = 1:e

if pp(l) == k1 & a(i,l) > k1max

k1max = a(i,l);

end

end

for k = 1:c

if k1 == k

continue

end

kmax = 0;

for l = 1:e

if pp(l) == k & a(i,l) > kmax

kmax = a(i,l);

end

end

if kmax < k1max

numk(k) = numk(k)+1; numk(k1) = numk(k1) - 1; k1max = kmax;

pp(i) = k; k1 = k; trig = 0;

end

end

end

end

z = 0;

for i = 1:e-1

for j = i+1:e

if pp(i)==pp(j) & a(i,j) > z

z = a(i,j);

end

end

end

if z < bestz

bestz = z;

end

end

ss = zeros(1,e); % Reorder by looking at the number

for i = 1:e % of elements in each row of the

ss(i) = sum(a(i,:)>=bestz); % dissimilarity matrix that are

end % greater than the heuristic bound

[st,index] = sort(ss);

aa = a;

for i = 1:e

index2(i) = index(e-i+1);

end

a = a(index2,index2);

p = 0; q = c; z = bestz+.01 % Initialize

x = zeros(1,e); xb = zeros(1,e);

n = zeros(1,c);

optfound = 0; trig1 = 0; trig2 = 0;

while optfound == 0

if trig1 == 0 % Branch Forward

p = p + 1; m = 1; n(m) = n(m) + 1; x(p) = m;

if n(m) == 1

q = q - 1;

end

trig1 = 1;

end

if trig2 == 0

if e - p >= q % Check Feasibility

trig7 = 0;

for i = 1:p-1

k1 = x(i);

for j = i+1:p

k2 = x(j);

if k1 == k2 & a(i,j) > z - epsilon

trig7 = 1;

break

end

end

% if k1 == m & a(i,p) > z-epsilon;

% trig7 = 1;

% break

% end

end

if trig7 == 0 & q == 0

for j = p+1:e

ksum = zeros(1,c);

for i = 1:p

ki = x(i);

if a(i,j) > ksum(ki)

ksum(ki) = a(i,j);

end

end

kmin = min(ksum);

if kmin > z-epsilon

trig7 = 1;

break

end

end

end

if trig7 == 0

if p ~= e

trig1 = 0;

continue

else

xb = x; z = 0; % Install new best solution

for i = 1:p-1

ki = x(i);

for j = i+1:p

kj = x(j);

if ki == kj & a(i,j) > z

z = a(i,j);

end

end

end

end

end

end

end

if m == c || n(m) == 1 % Dispensation (if m = c or n(m) = 1)

x(p) = 0; % then Retract

n(m) = n(m) - 1; p = p - 1;

if n(m) == 0

q = q + 1;

end

if p == 0

optfound = 1;

continue

else

m = x(p); trig1 = 1; trig2 = 1;

end

else

% Otherwise, Branch Right

n(m) = n(m) - 1; m = m + 1; n(m) = n(m)+1; x(p) = m;

if n(m) == 1

q = q - 1;

end

trig1 = 1; trig2 = 0;

continue

end

end

diameter = z; a = aa; % Remap everything back.

for i = 1:e

partition(index2(i)) = xb(i);

end

toc

Program 3: RANDOPT.M

function [ari,newa,newb] = randopt(mata,matb,parta,partb,diama,diamb)

% RANDOPT.M reads two dissimilarity matrices (mata and matb), two

% partitions of those matrices (parta, partb), and the diameters of the

% partitions (diama and diamb). An exchange algorithm is used to refine

% the partitions to maximize the adjusted Rand index, while maintaining

% minimum-diameter partitions. The best-found ARI and the new partitions

% (newa and newb) are returned as output.

datax = mata; datay = matb; xx = parta; yy= partb; boundx = diama;

boundy = diamb;

m = length(xx); a = 0; b = 0; c = 0; d = 0; x = xx; y = yy;

kx = max(xx); ky = max(yy);

n = m.*(m-1)./2; % compute initial ARI

for i = 1:m-1

for j = i+1:m

if x(i) == x(j) & y(i) == y(j)

a = a + 1;

elseif x(i) ~= x(j) & y(i) ~= y(j)

d = d + 1;

elseif x(i) == x(j) & y(i) ~= y(j)

b = b + 1;

else

c = c + 1;

end

end

end

rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./(n.*n-((a+b).*(a+c)+(c+d).*(b+d)))

xopt = xx; yopt = yy;

trig = 0; % Begin exchange algorithm here.

while trig == 0 % If at least one improvement is

trig = 1; rb = rx; % found during a cycle, then trig = 0

for ii = 1:m

for k = 1:kx

if k == xx(ii)

continue

end

x = xx; x(ii) = k; itest = 0; y = yy;

for iii = 1:m

if x(ii) == x(iii) & datax(ii,iii) > boundx

itest = 1; break

end

end

if itest == 1

continue

end

a = 0; b = 0; c = 0; d = 0;

for i = 1:m-1

for j = i+1:m

if x(i) == x(j) & y(i) == y(j)

a = a + 1;

elseif x(i) ~= x(j) & y(i) ~= y(j)

d = d + 1;

elseif x(i) == x(j) & y(i) ~= y(j)

b = b + 1;

else

c = c + 1;

end

end

end

rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./

(n.*n-((a+b).*(a+c)+(c+d).*(b+d)));

if rx > rb

rb = rx; isel = ii; ksel = k; icol = 1; trig = 0;

end

end

end

for ii = 1:m

for k = 1:ky

if k == yy(ii)

continue

end

y = yy; y(ii) = k;itest = 0; x=xx;

for iii = 1:m

if y(ii) == y(iii) & datay(ii,iii) > boundy

itest = 1; break

end

end

if itest == 1

continue

end

a = 0; b = 0; c = 0; d = 0;

for i = 1:m-1

for j = i+1:m

if x(i) == x(j) & y(i) == y(j)

a = a + 1;

elseif x(i) ~= x(j) & y(i) ~= y(j)

d = d + 1;

elseif x(i) == x(j) & y(i) ~= y(j)

b = b + 1;

else

c = c + 1;

end

end

end

rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./

(n.*n-((a+b).*(a+c)+(c+d).*(b+d)));

if rx > rb

isel = ii; ksel = k; icol = 2; trig = 0;

rb = rx;

end

end

end

if trig == 0

if icol == 1

xx(isel) = ksel; rx = rb

else

yy(isel) = ksel; rx = rb

end

end

end

newa = xx; newb = yy; ari=rb;

Program 4: BBDOM.M

function [domindex,permopt, lindx, con, incon] = bbdom(confusion_matrix)

% BBDOM.M uses a branch-and-bound algorithm to fins a reordering of the

% rows and columns of an asymmetric matrix so as to maximize the sum of the

% elements above the main diagonal of the reordered matrix. The CPU time

% for this algorithm begins to accelerate for matrices larger than 20x20

tic; a = confusion_matrix;

[n,n1]=size(a);

for i = 1:n

a(i,i) = 0;

end

for i = 1:n-1

for j = i+1:n

b(i,j) = max(a(i,j),a(j,i)); b(j,i) = b(i,j);

end

end

nreps = 100; % Total number of replications

domindex=0; % Best z-value across all replications

for iii = 1:nreps

s = randperm(n); % Create a random permutation

z = sum(sum(triu(a(s,s)))); % Sum above the main diagonal

z = z - sum(diag(a));

zbest = z; % Best z-value for particular replication

sb = s;

dtarg=1; % Perform pairwise interchange to guarantee a

% local optimum with respect to that operation

while dtarg > eps

dtarg = 0;

for i = 1:n-1

for j = i+1:n

delta=a(s(j),s(i))-a(s(i),s(j));

for k = i+1:j-1

delta=delta-a(s(i),s(k))+a(s(j),s(k))+a(s(k),s(i))-a(s(k),s(j));

end

if delta > dtarg

dtarg = delta;

z = z + dtarg;

jdum = s(j);

s(j) = s(i);

s(i) = jdum;

end

end

end

end

if z > domindex

domindex = z;

end

end

z = domindex - 1

q = zeros(1,n+1); s = zeros(1,n+1);

m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;

z1 = 0;

for j = 1:n

if j ~= q(m)

z1 = z1 + a(q(m),j);

end

end

while trigend == 0

if trig1 == 0

m = m + 1; trig1 = 1; % advance pointer

end

q(m) = q(m) + 1;

if s(q(m)) == 1 % redundancy

continue

end

if m == 1 & q(m) > n % terminate

trigend = 1;

continue

end

if m > 1 & q(m) > n % retract

s(q(m)) = 0; q(m) = 0; m = m - 1;

for i = 1:n

if s(i)==1 & i ~= q(m)

continue

end

z1 = z1 - a(q(m),i);

end

s(q(m)) = 0;

continue

end

if m == n

% for i = 1:n

% for j = 1:n-1

% if q(j) == i

% break

% end

% q(n) = i;

% if q(6) == 6 & q(7) == 10

% junk = 1;

% end

% end

% end

zbd = 0;

for i = 1:n-1

for j = i+1:n

zbd = zbd + a(q(i),q(j));

end

end

if zbd > z

z = zbd; x = q;

end

elseif m == 1

z1 = 0;

for j = 1:n

if j ~= q(m)

z1 = z1 + a(q(m),j);

end

end

trig1 = 0; s(q(m))=1;

else

if a(q(m),q(m-1)) > a(q(m-1),q(m))

continue

end

if a(q(m),q(m-1)) == a(q(m-1),q(m)) & q(m) < q(m-1)

continue

end

rtrig = 0;

for mm = m-2:-1:1

rdx = 0;

for i = mm:m-1

rdx = rdx + a(q(m),q(i)) - a(q(i),q(m));

end

if rdx > 0

rtrig = 1;

end

end

if rtrig == 1

continue

end

z2 = 0;

for i = 1:n

if s(i) == 0 & i ~= q(m)

z2 = z2 + a(q(m),i);

end

end

z3 = 0;

for i = 1:n-1

if s(i) == 1 || i == q(m)

continue

end

for j = i+1:n

if s(j) == 1 || j == q(m)

continue

end

z3 = z3 + b(i,j); %max(a(i,j),a(j,i));

end

end

if z1 + z2 + z3 > z

trig1 = 0; s(q(m)) = 1; z1 = z1 + z2;

end

end

end

x(n+1) = [];

reordered_matrix = a(x,x);

summat = sum(sum(a)); lindx = z./summat; con = 0; incon = 0;

for i = 1:n-1

for j = i+1:n

if reordered_matrix(i,j) > reordered_matrix(j,i)

con = con + 1;

end

if reordered_matrix(i,j) < reordered_matrix(j,i)

incon = incon + 1;

end

end

end

domindex = z; permopt = x;

toc;

Program 5: BBSUBSET.M

function [subset, index] = bbsubset(a, num_subsets, sub_size)

%BBSUBSET.M: This program reads an n-by-n matrix 'a' and will extract

%'num_subsets' each of size 'sub_size' based on the objective of maximizing

%the within subset sums of matrix elements minus the between subset sums of

%matrix elements. The program is fast when sub_size = 2, but can be much

%slower for sub_size > 2.

tic;

[n,n1]=size(a); ict = 0; c = num_subsets; gs = sub_size;

for k = 1:c

for h = 1:gs

ict = ict + 1; tau(ict) = k;

end

end

targ = ict;

ict = 0;

for i = 1:n-1

for j = i+1:n

ict = ict + 1; r(ict) = a(i,j); pair1(ict) = i; pair2(ict) = j;

end

end

% This block is designed to establish bounds for the between subset

% and within subset sums.

r = -r; [r,idx] = sort(r); r = -r; pair1 = pair1(idx); pair2 = pair2(idx);

delta = zeros(1,targ); omega = zeros(1,targ);

ws = 0; bs = 0;

ic = 0;

for k = 1:c

for h = 1:gs

ic = ic + 1;

for l = 1:h-1

ws = ws + 1;

end

delta(ic) = ws;

for l = 1:k-1

for u = 1:gs

bs = bs + 1;

end

end

omega(ic) = bs;

end

end

within_terms = 0; between_terms = 0;

for k = 1:c

within_terms = within_terms + gs.*(gs-1)./2;

end

for k = 1:c-1;

for h = k+1:c;

between_terms = between_terms + gs.*gs;

end

end

%This block is a greedy heuristic for obtaining an initial lower bound for

%the within subset sums minus the between subset sums.

sel = zeros(1,n);

prt = zeros(k,n);

for k = 1:c

amax = -99999999;

for i = 1:n-1

if sel(i) == 1

continue

end

for j = i+1:n

if sel(j) == 1

continue

end

asum = 0;

for h = 1:k-1

for l = 1:gs

asum = asum - a(i,prt(h,l)) - a(j,prt(h,l));

end

end

if a(i,j) + asum > amax

amax = a(i,j)+asum; isel = i; jsel = j;

end

end

end

prt(k,1) = isel; prt(k,2) = jsel; sel(isel) = 1; sel(jsel) = 1;

for i = 3:gs

amax = -99999999;

for j = 1:n

if sel(j) == 1

continue

end

asum = 0;

for h = 1:i-1

asum = asum + a(j,prt(k,h));

end

for h = 1:k-1

for l = 1:gs

asum = asum - a(j,prt(h,l));

end

end

if asum > amax

jsel = j; amax = asum;

end

end

prt(k,i) = jsel; sel(jsel) = 1;

end

end

z = 0;

for k = 1:c

for i = 1:gs-1

for j = i+1:gs

z = z + a(prt(k,i),prt(k,j));

end

end

end

for k = 1:c-1

for l = k+1:c

for i = 1:gs

for j = 1:gs

z = z - a(prt(k,i),prt(l,j));

end

end

end

end

z = z-1;

%The branch-and-bound algorithm begins here

q = zeros(1,n+1); s = zeros(1,n+1); test_sum = 0; tsum = zeros(1,n);

m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;

while trigend == 0

if trig1 == 0

m = m + 1; trig1 = 1; % advance pointer

end

q(m) = q(m) + 1;

if s(q(m)) ~= 0 % redundancy

continue

end

if m > 2

if tau(m-1)==tau(m) & q(m) < q(m-1) % redundancy

continue

end

if tau(m-1)~=tau(m) & q(m) < q(m-gs) % redundancy

continue

end

end

if m == 1 & q(m) > n % terminate

trigend = 1;

continue

end

if m > 1 & q(m) > n % retract

s(q(m)) = 0; q(m) = 0; m = m - 1; test_sum = test_sum - tsum(m); tsum(m) = 0;

s(q(m)) = 0;

continue

end

for i = 1:m-1

v = q(i);

if tau(i) == tau(m)

tsum(m) = tsum(m) + a(v,q(m));

else

tsum(m) = tsum(m) - a(v,q(m));

end

end

test_sum = test_sum + tsum(m);

w = within_terms - delta(m); % b = between_terms - omega(m);

rbb = 0; rff = 0; isum = 0; isel = 0;

while isel < w

isum = isum + 1; pr1 = pair1(isum); pr2 = pair2(isum);

if s(pr1)~=0 & s(pr2) ~= 0

continue

end

if s(pr1) > 0 & s(pr1) ~= tau(m)

continue

end

if s(pr2) > 0 & s(pr2) ~= tau(m)

continue

end

rbb = rbb + r(isum); isel = isel + 1;

end

if test_sum + rbb - rff > z % Partial solution passes so

if m == targ % update incumbent if an optimum

z = test_sum;

partition = q;

test_sum = test_sum - tsum(m); tsum(m) = 0;

continue

end

trig1 = 0; s(q(m))=tau(m); % Otherwise, branch forward

else

test_sum = test_sum - tsum(m); tsum(m) = 0;

end

end

subset = partition; index = z;

toc