Supplemental online material for Clustering Seriation and Subset by jennyyingdi

VIEWS: 6 PAGES: 14

									Supplemental online material for “Clustering, Seriation, and Subset Extraction of Confusion Data,”
by Michael J. Brusco and J. Douglas Steinley.

Program 1: PREPARE.M
function [s1, s2, s3, d1, d2, d3] = prepare(confuse)
%PREPARE2 reads in a raw confusion matrix 'confuse', and prepares new matrices
%Five output matrices are produced (all diagonal terms are zeroed out)
%Matrix s1 is a symmetric similarity matrix obtained by the arithmetic mean
%            of off diagonals
%Matrix s2 is a symmetric similarity matrix obtained by the arithmetic mean
%            of off diagonals
%Matrix s3 is a symmetric similarity matrix based on Luce's (1963) SCM
%Matrix d1 is a symmetric dissimilarity matrix based on s1
%Matrix d2 is a symmetric dissimilarity matrix based on s2
%Matrix d3 is Shepard's (1957) pseudo Euclidean distance, which is
%            computed as the natural log of the terms in s3.
epsilon = .000000000001; sse = 99999; [n,n1] = size(confuse); lambda =
ones(n,n);
s1 = confuse' + confuse; s1 = s1./2;
for i = 1:n-1
    for j = 1+i:n
        s2(i,j) = (confuse(i,j).*confuse(j,i)).^.5; s2(j,i) = s2(i,j);
    end
end

trig = 0;
for i = 1:n                 % Find out if there are any cases where
    for j = 1:n             % a(i,j) = a(j,i) = 0
        if confuse(i,j) == 0 && confuse(j,i)==0
             trig = 1;
        end
    end
end
if trig == 1                % Use Heiser's "1/2 rule" for raw confusion data
    for i = 1:n-1           % and Gilmore et al.'s rule for confusion pcts.
        for j = i+1:n
             if confuse(i,j) == 0 && confuse(j,i)==0
                 if trace(confuse) > 2.*n
                      confuse(i,j) = confuse(i,j) + .5; confuse(j,i) =
confuse(j,i) + .5;
                 else
                      confuse(i,j) = confuse(i,j) + .001; confuse(j,i) =
confuse(j,i) + .001;
                 end
             end
        end
    end
end

% This block fits the Shepard-Luce similarity choice matrix (s2) by using
% iterative proportional fitting as described by Heiser (1988).
lambda_new = zeros(n,n); eta = zeros(n,n);
while sse > epsilon
    lambprev = lambda;
    csum = sum(sum(confuse(:,:)));
    lambda_sum = sum(sum(lambda(:,:)));
    lambda = (csum./lambda_sum).*lambda;
    rowcsum = sum(confuse');
    rowlambdasum = sum(lambda');
    for i = 1:n
        for j = 1:n
            lambda_new(i,j) = (rowcsum(i)./rowlambdasum(i)).* lambda(i,j);
        end
    end
    lambda = lambda_new;
    colcsum = sum(confuse);
    collambdasum = sum(lambda);
    for j = 1:n
        for i = 1:n
            lambda_new(i,j) = (colcsum(j)./collambdasum(j)).* lambda(i,j);
        end
    end
    lambda = lambda_new;
    for i = 1:n
        for j = 1:n
            lambda_new(i,j) =
(confuse(i,j)+confuse(j,i)).*lambda(i,j)./(lambda(i,j)+lambda(j,i));
        end
    end
    lambda = lambda_new;
    diff = (lambda - lambprev).^2;
    sse = sum(sum(diff(:,:)));
end
for i = 1:n
    for j = 1:n
        eta(i,j) = ((lambda(i,j).*lambda(j,i))./(lambda(i,i).*lambda(j,j))).^.5;
    end
end
s3 = eta;
d3 = -log(s3); % Matrix d3 is Shepard's distance based on s3

%The block below will obtain dissimilarity matrices d1 and d2 based on
%matrices s1 and s2, respectively.
for i = 1:n
  s1(i,i) = 0; s2(i,i) = 0;
end
maxs1 = max(max(s1)); maxs2 = max(max(s2)); d1 = maxs1 - s1; d2 = maxs2-s2;
for i = 1:n
  d1(i,i) = 0; d2(i,i) = 0; d3(i,i) = 0;
end


Program 2: BBDIAM.M
function [partition, diameter] = bbdiam(a, num_clusters)
%BBDIAM find a minimum-diameter partition of 'e' objects corresponding to
%       the dissimilarity matrix 'a'. The number of clusters in the
%       partition is c = 'num_clusters'. The minimum diameter is
%       'diameter' and the 'partition' contains the optimal partition.
%       This program uses 10 replication of an exchange algorithm to get
%       a good upper bound on partition diameter.
%         It also reorders using the rule from Brusco and Cradit (2004), but
%         this is transparent to the user because the solution is remapped
%         backed to the original ordering prior to returning 'partition'


tic;
[n1,e] = size(a); epsilon = .0000001; c = num_clusters;

bestz = 99999999999;
for ijk = 1:10

    pp = 1:c; pp = repmat(pp,1,e); qq = randperm(e.*c); pp = pp(qq); pp = pp(1:e);
    for k = 1:c
        numk(k)=sum(pp==k);                  % Run exchange heuristic to get
    end                                      % a good initial bound
    trig = 0;
    while trig == 0
        trig = 1;
        for i = 1:e
            k1 = pp(i);
            if numk(k1)==1
                continue
            end
            k1max = 0;
            for l = 1:e
                if pp(l) == k1 && a(i,l) > k1max
                    k1max = a(i,l);
                end
            end
            for k = 1:c
                if k1 == k
                    continue
                end
                kmax = 0;
                for l = 1:e
                  if pp(l) == k && a(i,l) > kmax
                    kmax = a(i,l);
                  end
                end
                if kmax < k1max
                    numk(k) = numk(k)+1; numk(k1) = numk(k1) - 1; k1max = kmax;
                    pp(i) = k; k1 = k; trig = 0;
                end
            end
        end
    end
    z = 0;
    for i = 1:e-1
        for j = i+1:e
            if pp(i)==pp(j) && a(i,j) > z
                z = a(i,j);
            end
        end
    end

    if z < bestz
        bestz = z;
  end
end

ss = zeros(1,e);                      %   Reorder by looking at the number
for i = 1:e                           %   of elements in each row of the
    ss(i) = sum(a(i,:)>=bestz);       %   dissimilarity matrix that are
end                                   %   greater than the heuristic bound
[st,index] = sort(ss);
aa = a;
for i = 1:e
    index2(i) = index(e-i+1);
end
a = a(index2,index2);

p = 0; q = c; z = bestz+.01            % Initialize
x = zeros(1,e); xb = zeros(1,e);
n = zeros(1,c);
optfound = 0; trig1 = 0; trig2 = 0;
while optfound == 0
  if trig1 == 0                          % Branch Forward
    p = p + 1; m = 1; n(m) = n(m) + 1; x(p) = m;
    if n(m) == 1
      q = q - 1;
    end
    trig1 = 1;
  end
  if trig2 == 0
    if e - p >= q                       % Check Feasibility
      trig7 = 0;
      for i = 1:p-1
        k1 = x(i);
        for j = i+1:p
           k2 = x(j);
           if k1 == k2 && a(i,j) > z - epsilon
              trig7 = 1;
              break
           end
        end
%         if k1 == m && a(i,p) > z-epsilon;
%           trig7 = 1;
%           break
%         end
      end
      if trig7 == 0 && q == 0
        for j = p+1:e
           ksum = zeros(1,c);
           for i = 1:p
              ki = x(i);
              if a(i,j) > ksum(ki)
                ksum(ki) = a(i,j);
              end
           end
           kmin = min(ksum);
           if kmin > z-epsilon
              trig7 = 1;
              break
           end
         end
       end
       if trig7 == 0
         if p ~= e
           trig1 = 0;
           continue
         else
           xb = x; z = 0;         % Install new best solution
           for i = 1:p-1
              ki = x(i);
              for j = i+1:p
                kj = x(j);
                if ki == kj && a(i,j) > z
                  z = a(i,j);
                end
              end
           end
         end
       end
    end
  end
  if m == c || n(m) == 1          % Dispensation (if m = c or n(m) = 1)
    x(p) = 0;                     % then Retract
    n(m) = n(m) - 1; p = p - 1;
    if n(m) == 0
       q = q + 1;
    end
    if p == 0
       optfound = 1;
       continue
    else
       m = x(p); trig1 = 1; trig2 = 1;
    end
  else
                                % Otherwise, Branch Right
    n(m) = n(m) - 1; m = m + 1; n(m) = n(m)+1; x(p) = m;
    if n(m) == 1
       q = q - 1;
    end
    trig1 = 1; trig2 = 0;
    continue
  end
end
diameter = z; a = aa;          % Remap everything back.
for i = 1:e
    partition(index2(i)) = xb(i);
end
toc


Program 3: RANDOPT.M
function [ari,newa,newb] = randopt(mata,matb,parta,partb,diama,diamb)
% RANDOPT.M reads two dissimilarity matrices (mata and matb), two
% partitions of those matrices (parta, partb), and the diameters of the
% partitions (diama and diamb). An exchange algorithm is used to refine
% the partitions to maximize the adjusted Rand index, while maintaining
% minimum-diameter partitions. The best-found ARI and the new partitions
% (newa and newb) are returned as output.
datax = mata; datay = matb; xx = parta; yy= partb; boundx = diama;
boundy = diamb;
m = length(xx); a = 0; b = 0; c = 0; d = 0; x = xx; y = yy;
kx = max(xx); ky = max(yy);
n = m.*(m-1)./2;                              % compute initial ARI
for i = 1:m-1
  for j = i+1:m
    if x(i) == x(j) && y(i) == y(j)
      a = a + 1;
    elseif x(i) ~= x(j) && y(i) ~= y(j)
      d = d + 1;
    elseif x(i) == x(j) && y(i) ~= y(j)
      b = b + 1;
    else
      c = c + 1;
    end
  end
end
rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./(n.*n-((a+b).*(a+c)+(c+d).*(b+d)))
xopt = xx; yopt = yy;
trig = 0;                              % Begin exchange algorithm here.
while trig == 0                        % If at least one improvement is
    trig = 1; rb = rx;                 % found during a cycle, then trig = 0
    for ii = 1:m
         for k = 1:kx
             if k == xx(ii)
                 continue
             end
             x = xx; x(ii) = k; itest = 0; y = yy;
             for iii = 1:m
                 if x(ii) == x(iii) && datax(ii,iii) > boundx
                      itest = 1; break
                 end
             end
             if itest == 1
                 continue
             end
             a = 0; b = 0; c = 0; d = 0;
             for i = 1:m-1
                 for j = i+1:m
                      if x(i) == x(j) && y(i) == y(j)
                           a = a + 1;
                      elseif x(i) ~= x(j) && y(i) ~= y(j)
                           d = d + 1;
                      elseif x(i) == x(j) && y(i) ~= y(j)
                           b = b + 1;
                      else
                           c = c + 1;
                      end
                 end
             end
             rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./
                  (n.*n-((a+b).*(a+c)+(c+d).*(b+d)));
             if rx > rb
                  rb = rx; isel = ii; ksel = k; icol = 1; trig = 0;
            end
        end
    end
    for ii = 1:m
        for k = 1:ky
             if k == yy(ii)
                 continue
             end
             y = yy; y(ii) = k;itest = 0; x=xx;
             for iii = 1:m
                 if y(ii) == y(iii) && datay(ii,iii) > boundy
                     itest = 1; break
                 end
             end
             if itest == 1
                 continue
             end
             a = 0; b = 0; c = 0; d = 0;
             for i = 1:m-1
                 for j = i+1:m
                     if x(i) == x(j) && y(i) == y(j)
                          a = a + 1;
                     elseif x(i) ~= x(j) && y(i) ~= y(j)
                          d = d + 1;
                     elseif x(i) == x(j) && y(i) ~= y(j)
                          b = b + 1;
                     else
                          c = c + 1;
                     end
                 end
             end
             rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./
                  (n.*n-((a+b).*(a+c)+(c+d).*(b+d)));
             if rx > rb
                 isel = ii; ksel = k; icol = 2; trig = 0;
                 rb = rx;
             end
        end
    end
    if trig == 0
        if icol == 1
             xx(isel) = ksel; rx = rb
        else
             yy(isel) = ksel; rx = rb
        end
    end
end
newa = xx; newb = yy; ari=rb;


Program 4: BBDOM.M
function [domindex,permopt, lindx, con, incon] = bbdom(confusion_matrix)
% BBDOM.M uses a branch-and-bound algorithm to fins a reordering of the
% rows and columns of an asymmetric matrix so as to maximize the sum of the
% elements above the main diagonal of the reordered matrix. The CPU time
% for this algorithm begins to accelerate for matrices larger than 20x20
tic; a = confusion_matrix;
[n,n1]=size(a);
for i = 1:n
    a(i,i) = 0;
end
for i = 1:n-1
    for j = i+1:n
        b(i,j) = max(a(i,j),a(j,i)); b(j,i) = b(i,j);
    end
end
nreps = 100;                     % Total number of replications
domindex=0;                      % Best z-value across all replications
for iii = 1:nreps
  s = randperm(n);               % Create a random permutation
  z = sum(sum(triu(a(s,s))));    % Sum above the main diagonal
  z = z - sum(diag(a));
  zbest = z;                     % Best z-value for particular replication
  sb = s;

  dtarg=1;                       % Perform pairwise interchange to guarantee a
                                 % local optimum with respect to that operation

  while dtarg > eps
    dtarg = 0;
    for i = 1:n-1
      for j = i+1:n
        delta=a(s(j),s(i))-a(s(i),s(j));
        for k = i+1:j-1
          delta=delta-a(s(i),s(k))+a(s(j),s(k))+a(s(k),s(i))-a(s(k),s(j));

        end
        if delta > dtarg
          dtarg = delta;
          z = z + dtarg;
          jdum = s(j);
          s(j) = s(i);
          s(i) = jdum;
        end
      end
    end
  end
  if z > domindex
    domindex = z;
  end
end


z = domindex - 1


q = zeros(1,n+1); s = zeros(1,n+1);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
z1 = 0;
for j = 1:n
  if j ~= q(m)
    z1 = z1 + a(q(m),j);
  end
end
while trigend == 0
  if trig1 == 0
    m = m + 1; trig1 = 1;      % advance pointer
  end
  q(m) = q(m) + 1;
  if s(q(m)) == 1              % redundancy
    continue
  end

  if m == 1 && q(m) > n         % terminate
    trigend = 1;
    continue
  end
  if m > 1 && q(m) > n          % retract
    s(q(m)) = 0; q(m) = 0; m = m - 1;
    for i = 1:n
       if s(i)==1 && i ~= q(m)
          continue
       end
       z1 = z1 - a(q(m),i);
    end
    s(q(m)) = 0;
    continue
  end
  if m == n
%     for i = 1:n
%       for j = 1:n-1
%          if q(j) == i
%            break
%          end
%          q(n) = i;
%         if q(6) == 6 && q(7) == 10
%            junk = 1;
%          end
%       end
%     end

   zbd = 0;
   for i = 1:n-1
       for j = i+1:n
            zbd = zbd + a(q(i),q(j));
       end
   end
   if zbd > z
     z = zbd; x = q;
   end
 elseif m == 1
   z1 = 0;
   for j = 1:n
     if j ~= q(m)
       z1 = z1 + a(q(m),j);
     end
   end
   trig1 = 0; s(q(m))=1;
  else

    if a(q(m),q(m-1)) > a(q(m-1),q(m))
      continue
    end
    if a(q(m),q(m-1)) == a(q(m-1),q(m)) && q(m) < q(m-1)
      continue
    end
    rtrig = 0;
    for mm = m-2:-1:1
      rdx = 0;
      for i = mm:m-1
        rdx = rdx + a(q(m),q(i)) - a(q(i),q(m));
      end
      if rdx > 0
        rtrig = 1;
      end
    end
    if rtrig == 1
      continue
    end
    z2 = 0;
    for i = 1:n
      if s(i) == 0 && i ~= q(m)
        z2 = z2 + a(q(m),i);
      end
    end
    z3 = 0;
    for i = 1:n-1
      if s(i) == 1 || i == q(m)
        continue
      end
      for j = i+1:n
        if s(j) == 1 || j == q(m)
          continue
        end
        z3 = z3 + b(i,j); %max(a(i,j),a(j,i));
      end
    end

    if z1 + z2 + z3 > z
      trig1 = 0; s(q(m)) = 1; z1 = z1 + z2;
    end
  end
end
x(n+1) = [];
reordered_matrix = a(x,x);
summat = sum(sum(a)); lindx = z./summat; con = 0; incon = 0;
for i = 1:n-1
    for j = i+1:n
        if reordered_matrix(i,j) > reordered_matrix(j,i)
             con = con + 1;
        end
        if reordered_matrix(i,j) < reordered_matrix(j,i)
             incon = incon + 1;
        end
    end
end
domindex = z; permopt = x;
toc;


Program 5: BBSUBSET.M
function [subset, index] = bbsubset(a, num_subsets, sub_size)
%BBSUBSET.M: This program reads an n-by-n matrix 'a' and will extract
%'num_subsets' each of size 'sub_size' based on the objective of maximizing
%the within subset sums of matrix elements minus the between subset sums of
%matrix elements. The program is fast when sub_size = 2, but can be much
%slower for sub_size > 2.

tic;
[n,n1]=size(a); ict = 0; c = num_subsets; gs = sub_size;
for k = 1:c
  for h = 1:gs
     ict = ict + 1; tau(ict) = k;
  end
end
targ = ict;

ict = 0;
for i = 1:n-1
  for j = i+1:n
    ict = ict + 1; r(ict) = a(i,j); pair1(ict) = i; pair2(ict) = j;
  end
end

% This block is designed to establish bounds for the between subset
% and within subset sums.
r = -r; [r,idx] = sort(r); r = -r; pair1 = pair1(idx); pair2 = pair2(idx);
delta = zeros(1,targ); omega = zeros(1,targ);

ws = 0; bs = 0;
ic = 0;
for k = 1:c
  for h = 1:gs
    ic = ic + 1;
    for l = 1:h-1
      ws = ws + 1;
    end
    delta(ic) = ws;
    for l = 1:k-1
      for u = 1:gs
        bs = bs + 1;
      end
    end
    omega(ic) = bs;
  end
end

within_terms = 0; between_terms = 0;
for k = 1:c
  within_terms = within_terms + gs.*(gs-1)./2;
end
for k = 1:c-1;
  for h = k+1:c;
    between_terms = between_terms + gs.*gs;
  end
end

%This block is a greedy heuristic for obtaining an initial lower bound for
%the within subset sums minus the between subset sums.

sel = zeros(1,n);
prt = zeros(k,n);
for k = 1:c
  amax = -99999999;
  for i = 1:n-1
    if sel(i) == 1
       continue
    end
    for j = i+1:n
       if sel(j) == 1
         continue
       end
       asum = 0;
       for h = 1:k-1
         for l = 1:gs
           asum = asum - a(i,prt(h,l)) - a(j,prt(h,l));
         end
       end
       if a(i,j) + asum > amax
         amax = a(i,j)+asum; isel = i; jsel = j;
       end
    end
  end
  prt(k,1) = isel; prt(k,2) = jsel; sel(isel) = 1; sel(jsel) = 1;
  for i = 3:gs
    amax = -99999999;
    for j = 1:n
       if sel(j) == 1
         continue
       end
       asum = 0;
       for h = 1:i-1
         asum = asum + a(j,prt(k,h));
       end
       for h = 1:k-1
         for l = 1:gs
           asum = asum - a(j,prt(h,l));
         end
       end
       if asum > amax
         jsel = j; amax = asum;
       end
    end
    prt(k,i) = jsel; sel(jsel) = 1;
  end
end
z = 0;
for k = 1:c
  for i = 1:gs-1
    for j = i+1:gs
      z = z + a(prt(k,i),prt(k,j));
    end
  end
end
for k = 1:c-1
  for l = k+1:c
    for i = 1:gs
      for j = 1:gs
         z = z - a(prt(k,i),prt(l,j));
      end
    end
  end
end
z = z-1;

%The branch-and-bound algorithm begins here
q = zeros(1,n+1); s = zeros(1,n+1); test_sum = 0; tsum = zeros(1,n);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
while trigend == 0
   if trig1 == 0
     m = m + 1; trig1 = 1;     % advance pointer
   end
   q(m) = q(m) + 1;
   if s(q(m)) ~= 0             % redundancy
     continue
   end
   if m > 2
     if tau(m-1)==tau(m) && q(m) < q(m-1)      % redundancy
       continue
     end
     if tau(m-1)~=tau(m) && q(m) < q(m-gs)       % redundancy
       continue
     end
   end
   if m == 1 && q(m) > n       % terminate
     trigend = 1;
     continue
   end
   if m > 1 && q(m) > n        % retract
     s(q(m)) = 0; q(m) = 0; m = m - 1; test_sum = test_sum - tsum(m); tsum(m) =
0;
     s(q(m)) = 0;
     continue
   end

  for i = 1:m-1
    v = q(i);
    if tau(i) == tau(m)
      tsum(m) = tsum(m) + a(v,q(m));
    else
      tsum(m) = tsum(m) - a(v,q(m));
    end
  end
  test_sum = test_sum + tsum(m);
  w = within_terms - delta(m); % b = between_terms - omega(m);
  rbb = 0; rff = 0; isum = 0; isel = 0;
  while isel < w
      isum = isum + 1; pr1 = pair1(isum); pr2 = pair2(isum);
      if s(pr1)~=0 && s(pr2) ~= 0
          continue
      end
      if s(pr1) > 0 && s(pr1) ~= tau(m)
          continue
      end
      if s(pr2) > 0 && s(pr2) ~= tau(m)
          continue
      end
      rbb = rbb + r(isum); isel = isel + 1;
  end

  if test_sum + rbb - rff > z           % Partial solution passes so
    if m == targ                          % update incumbent if an optimum
       z = test_sum;
       partition = q;
       test_sum = test_sum - tsum(m); tsum(m) = 0;
       continue
    end
    trig1 = 0; s(q(m))=tau(m);          % Otherwise, branch forward
  else
    test_sum = test_sum - tsum(m); tsum(m) = 0;
  end
end
subset = partition; index = z;
toc

								
To top