Docstoc

Supplemental online material for Clustering Seriation and Subset

Document Sample
Supplemental online material for Clustering Seriation and Subset Powered By Docstoc
					Supplemental online material for “Clustering, Seriation, and Subset Extraction of Confusion Data,”
by Michael J. Brusco and J. Douglas Steinley.

Program 1: PREPARE.M
function [s1, s2, s3, d1, d2, d3] = prepare(confuse)
%PREPARE2 reads in a raw confusion matrix 'confuse', and prepares new matrices
%Five output matrices are produced (all diagonal terms are zeroed out)
%Matrix s1 is a symmetric similarity matrix obtained by the arithmetic mean
%            of off diagonals
%Matrix s2 is a symmetric similarity matrix obtained by the arithmetic mean
%            of off diagonals
%Matrix s3 is a symmetric similarity matrix based on Luce's (1963) SCM
%Matrix d1 is a symmetric dissimilarity matrix based on s1
%Matrix d2 is a symmetric dissimilarity matrix based on s2
%Matrix d3 is Shepard's (1957) pseudo Euclidean distance, which is
%            computed as the natural log of the terms in s3.
epsilon = .000000000001; sse = 99999; [n,n1] = size(confuse); lambda =
ones(n,n);
s1 = confuse' + confuse; s1 = s1./2;
for i = 1:n-1
    for j = 1+i:n
        s2(i,j) = (confuse(i,j).*confuse(j,i)).^.5; s2(j,i) = s2(i,j);
    end
end

trig = 0;
for i = 1:n                 % Find out if there are any cases where
    for j = 1:n             % a(i,j) = a(j,i) = 0
        if confuse(i,j) == 0 && confuse(j,i)==0
             trig = 1;
        end
    end
end
if trig == 1                % Use Heiser's "1/2 rule" for raw confusion data
    for i = 1:n-1           % and Gilmore et al.'s rule for confusion pcts.
        for j = i+1:n
             if confuse(i,j) == 0 && confuse(j,i)==0
                 if trace(confuse) > 2.*n
                      confuse(i,j) = confuse(i,j) + .5; confuse(j,i) =
confuse(j,i) + .5;
                 else
                      confuse(i,j) = confuse(i,j) + .001; confuse(j,i) =
confuse(j,i) + .001;
                 end
             end
        end
    end
end

% This block fits the Shepard-Luce similarity choice matrix (s2) by using
% iterative proportional fitting as described by Heiser (1988).
lambda_new = zeros(n,n); eta = zeros(n,n);
while sse > epsilon
    lambprev = lambda;
    csum = sum(sum(confuse(:,:)));
    lambda_sum = sum(sum(lambda(:,:)));
    lambda = (csum./lambda_sum).*lambda;
    rowcsum = sum(confuse');
    rowlambdasum = sum(lambda');
    for i = 1:n
        for j = 1:n
            lambda_new(i,j) = (rowcsum(i)./rowlambdasum(i)).* lambda(i,j);
        end
    end
    lambda = lambda_new;
    colcsum = sum(confuse);
    collambdasum = sum(lambda);
    for j = 1:n
        for i = 1:n
            lambda_new(i,j) = (colcsum(j)./collambdasum(j)).* lambda(i,j);
        end
    end
    lambda = lambda_new;
    for i = 1:n
        for j = 1:n
            lambda_new(i,j) =
(confuse(i,j)+confuse(j,i)).*lambda(i,j)./(lambda(i,j)+lambda(j,i));
        end
    end
    lambda = lambda_new;
    diff = (lambda - lambprev).^2;
    sse = sum(sum(diff(:,:)));
end
for i = 1:n
    for j = 1:n
        eta(i,j) = ((lambda(i,j).*lambda(j,i))./(lambda(i,i).*lambda(j,j))).^.5;
    end
end
s3 = eta;
d3 = -log(s3); % Matrix d3 is Shepard's distance based on s3

%The block below will obtain dissimilarity matrices d1 and d2 based on
%matrices s1 and s2, respectively.
for i = 1:n
  s1(i,i) = 0; s2(i,i) = 0;
end
maxs1 = max(max(s1)); maxs2 = max(max(s2)); d1 = maxs1 - s1; d2 = maxs2-s2;
for i = 1:n
  d1(i,i) = 0; d2(i,i) = 0; d3(i,i) = 0;
end


Program 2: BBDIAM.M
function [partition, diameter] = bbdiam(a, num_clusters)
%BBDIAM find a minimum-diameter partition of 'e' objects corresponding to
%       the dissimilarity matrix 'a'. The number of clusters in the
%       partition is c = 'num_clusters'. The minimum diameter is
%       'diameter' and the 'partition' contains the optimal partition.
%       This program uses 10 replication of an exchange algorithm to get
%       a good upper bound on partition diameter.
%         It also reorders using the rule from Brusco and Cradit (2004), but
%         this is transparent to the user because the solution is remapped
%         backed to the original ordering prior to returning 'partition'


tic;
[n1,e] = size(a); epsilon = .0000001; c = num_clusters;

bestz = 99999999999;
for ijk = 1:10

    pp = 1:c; pp = repmat(pp,1,e); qq = randperm(e.*c); pp = pp(qq); pp = pp(1:e);
    for k = 1:c
        numk(k)=sum(pp==k);                  % Run exchange heuristic to get
    end                                      % a good initial bound
    trig = 0;
    while trig == 0
        trig = 1;
        for i = 1:e
            k1 = pp(i);
            if numk(k1)==1
                continue
            end
            k1max = 0;
            for l = 1:e
                if pp(l) == k1 && a(i,l) > k1max
                    k1max = a(i,l);
                end
            end
            for k = 1:c
                if k1 == k
                    continue
                end
                kmax = 0;
                for l = 1:e
                  if pp(l) == k && a(i,l) > kmax
                    kmax = a(i,l);
                  end
                end
                if kmax < k1max
                    numk(k) = numk(k)+1; numk(k1) = numk(k1) - 1; k1max = kmax;
                    pp(i) = k; k1 = k; trig = 0;
                end
            end
        end
    end
    z = 0;
    for i = 1:e-1
        for j = i+1:e
            if pp(i)==pp(j) && a(i,j) > z
                z = a(i,j);
            end
        end
    end

    if z < bestz
        bestz = z;
  end
end

ss = zeros(1,e);                      %   Reorder by looking at the number
for i = 1:e                           %   of elements in each row of the
    ss(i) = sum(a(i,:)>=bestz);       %   dissimilarity matrix that are
end                                   %   greater than the heuristic bound
[st,index] = sort(ss);
aa = a;
for i = 1:e
    index2(i) = index(e-i+1);
end
a = a(index2,index2);

p = 0; q = c; z = bestz+.01            % Initialize
x = zeros(1,e); xb = zeros(1,e);
n = zeros(1,c);
optfound = 0; trig1 = 0; trig2 = 0;
while optfound == 0
  if trig1 == 0                          % Branch Forward
    p = p + 1; m = 1; n(m) = n(m) + 1; x(p) = m;
    if n(m) == 1
      q = q - 1;
    end
    trig1 = 1;
  end
  if trig2 == 0
    if e - p >= q                       % Check Feasibility
      trig7 = 0;
      for i = 1:p-1
        k1 = x(i);
        for j = i+1:p
           k2 = x(j);
           if k1 == k2 && a(i,j) > z - epsilon
              trig7 = 1;
              break
           end
        end
%         if k1 == m && a(i,p) > z-epsilon;
%           trig7 = 1;
%           break
%         end
      end
      if trig7 == 0 && q == 0
        for j = p+1:e
           ksum = zeros(1,c);
           for i = 1:p
              ki = x(i);
              if a(i,j) > ksum(ki)
                ksum(ki) = a(i,j);
              end
           end
           kmin = min(ksum);
           if kmin > z-epsilon
              trig7 = 1;
              break
           end
         end
       end
       if trig7 == 0
         if p ~= e
           trig1 = 0;
           continue
         else
           xb = x; z = 0;         % Install new best solution
           for i = 1:p-1
              ki = x(i);
              for j = i+1:p
                kj = x(j);
                if ki == kj && a(i,j) > z
                  z = a(i,j);
                end
              end
           end
         end
       end
    end
  end
  if m == c || n(m) == 1          % Dispensation (if m = c or n(m) = 1)
    x(p) = 0;                     % then Retract
    n(m) = n(m) - 1; p = p - 1;
    if n(m) == 0
       q = q + 1;
    end
    if p == 0
       optfound = 1;
       continue
    else
       m = x(p); trig1 = 1; trig2 = 1;
    end
  else
                                % Otherwise, Branch Right
    n(m) = n(m) - 1; m = m + 1; n(m) = n(m)+1; x(p) = m;
    if n(m) == 1
       q = q - 1;
    end
    trig1 = 1; trig2 = 0;
    continue
  end
end
diameter = z; a = aa;          % Remap everything back.
for i = 1:e
    partition(index2(i)) = xb(i);
end
toc


Program 3: RANDOPT.M
function [ari,newa,newb] = randopt(mata,matb,parta,partb,diama,diamb)
% RANDOPT.M reads two dissimilarity matrices (mata and matb), two
% partitions of those matrices (parta, partb), and the diameters of the
% partitions (diama and diamb). An exchange algorithm is used to refine
% the partitions to maximize the adjusted Rand index, while maintaining
% minimum-diameter partitions. The best-found ARI and the new partitions
% (newa and newb) are returned as output.
datax = mata; datay = matb; xx = parta; yy= partb; boundx = diama;
boundy = diamb;
m = length(xx); a = 0; b = 0; c = 0; d = 0; x = xx; y = yy;
kx = max(xx); ky = max(yy);
n = m.*(m-1)./2;                              % compute initial ARI
for i = 1:m-1
  for j = i+1:m
    if x(i) == x(j) && y(i) == y(j)
      a = a + 1;
    elseif x(i) ~= x(j) && y(i) ~= y(j)
      d = d + 1;
    elseif x(i) == x(j) && y(i) ~= y(j)
      b = b + 1;
    else
      c = c + 1;
    end
  end
end
rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./(n.*n-((a+b).*(a+c)+(c+d).*(b+d)))
xopt = xx; yopt = yy;
trig = 0;                              % Begin exchange algorithm here.
while trig == 0                        % If at least one improvement is
    trig = 1; rb = rx;                 % found during a cycle, then trig = 0
    for ii = 1:m
         for k = 1:kx
             if k == xx(ii)
                 continue
             end
             x = xx; x(ii) = k; itest = 0; y = yy;
             for iii = 1:m
                 if x(ii) == x(iii) && datax(ii,iii) > boundx
                      itest = 1; break
                 end
             end
             if itest == 1
                 continue
             end
             a = 0; b = 0; c = 0; d = 0;
             for i = 1:m-1
                 for j = i+1:m
                      if x(i) == x(j) && y(i) == y(j)
                           a = a + 1;
                      elseif x(i) ~= x(j) && y(i) ~= y(j)
                           d = d + 1;
                      elseif x(i) == x(j) && y(i) ~= y(j)
                           b = b + 1;
                      else
                           c = c + 1;
                      end
                 end
             end
             rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./
                  (n.*n-((a+b).*(a+c)+(c+d).*(b+d)));
             if rx > rb
                  rb = rx; isel = ii; ksel = k; icol = 1; trig = 0;
            end
        end
    end
    for ii = 1:m
        for k = 1:ky
             if k == yy(ii)
                 continue
             end
             y = yy; y(ii) = k;itest = 0; x=xx;
             for iii = 1:m
                 if y(ii) == y(iii) && datay(ii,iii) > boundy
                     itest = 1; break
                 end
             end
             if itest == 1
                 continue
             end
             a = 0; b = 0; c = 0; d = 0;
             for i = 1:m-1
                 for j = i+1:m
                     if x(i) == x(j) && y(i) == y(j)
                          a = a + 1;
                     elseif x(i) ~= x(j) && y(i) ~= y(j)
                          d = d + 1;
                     elseif x(i) == x(j) && y(i) ~= y(j)
                          b = b + 1;
                     else
                          c = c + 1;
                     end
                 end
             end
             rx = (n.*(a+d)-((a+b).*(a+c)+(c+d).*(b+d)))./
                  (n.*n-((a+b).*(a+c)+(c+d).*(b+d)));
             if rx > rb
                 isel = ii; ksel = k; icol = 2; trig = 0;
                 rb = rx;
             end
        end
    end
    if trig == 0
        if icol == 1
             xx(isel) = ksel; rx = rb
        else
             yy(isel) = ksel; rx = rb
        end
    end
end
newa = xx; newb = yy; ari=rb;


Program 4: BBDOM.M
function [domindex,permopt, lindx, con, incon] = bbdom(confusion_matrix)
% BBDOM.M uses a branch-and-bound algorithm to fins a reordering of the
% rows and columns of an asymmetric matrix so as to maximize the sum of the
% elements above the main diagonal of the reordered matrix. The CPU time
% for this algorithm begins to accelerate for matrices larger than 20x20
tic; a = confusion_matrix;
[n,n1]=size(a);
for i = 1:n
    a(i,i) = 0;
end
for i = 1:n-1
    for j = i+1:n
        b(i,j) = max(a(i,j),a(j,i)); b(j,i) = b(i,j);
    end
end
nreps = 100;                     % Total number of replications
domindex=0;                      % Best z-value across all replications
for iii = 1:nreps
  s = randperm(n);               % Create a random permutation
  z = sum(sum(triu(a(s,s))));    % Sum above the main diagonal
  z = z - sum(diag(a));
  zbest = z;                     % Best z-value for particular replication
  sb = s;

  dtarg=1;                       % Perform pairwise interchange to guarantee a
                                 % local optimum with respect to that operation

  while dtarg > eps
    dtarg = 0;
    for i = 1:n-1
      for j = i+1:n
        delta=a(s(j),s(i))-a(s(i),s(j));
        for k = i+1:j-1
          delta=delta-a(s(i),s(k))+a(s(j),s(k))+a(s(k),s(i))-a(s(k),s(j));

        end
        if delta > dtarg
          dtarg = delta;
          z = z + dtarg;
          jdum = s(j);
          s(j) = s(i);
          s(i) = jdum;
        end
      end
    end
  end
  if z > domindex
    domindex = z;
  end
end


z = domindex - 1


q = zeros(1,n+1); s = zeros(1,n+1);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
z1 = 0;
for j = 1:n
  if j ~= q(m)
    z1 = z1 + a(q(m),j);
  end
end
while trigend == 0
  if trig1 == 0
    m = m + 1; trig1 = 1;      % advance pointer
  end
  q(m) = q(m) + 1;
  if s(q(m)) == 1              % redundancy
    continue
  end

  if m == 1 && q(m) > n         % terminate
    trigend = 1;
    continue
  end
  if m > 1 && q(m) > n          % retract
    s(q(m)) = 0; q(m) = 0; m = m - 1;
    for i = 1:n
       if s(i)==1 && i ~= q(m)
          continue
       end
       z1 = z1 - a(q(m),i);
    end
    s(q(m)) = 0;
    continue
  end
  if m == n
%     for i = 1:n
%       for j = 1:n-1
%          if q(j) == i
%            break
%          end
%          q(n) = i;
%         if q(6) == 6 && q(7) == 10
%            junk = 1;
%          end
%       end
%     end

   zbd = 0;
   for i = 1:n-1
       for j = i+1:n
            zbd = zbd + a(q(i),q(j));
       end
   end
   if zbd > z
     z = zbd; x = q;
   end
 elseif m == 1
   z1 = 0;
   for j = 1:n
     if j ~= q(m)
       z1 = z1 + a(q(m),j);
     end
   end
   trig1 = 0; s(q(m))=1;
  else

    if a(q(m),q(m-1)) > a(q(m-1),q(m))
      continue
    end
    if a(q(m),q(m-1)) == a(q(m-1),q(m)) && q(m) < q(m-1)
      continue
    end
    rtrig = 0;
    for mm = m-2:-1:1
      rdx = 0;
      for i = mm:m-1
        rdx = rdx + a(q(m),q(i)) - a(q(i),q(m));
      end
      if rdx > 0
        rtrig = 1;
      end
    end
    if rtrig == 1
      continue
    end
    z2 = 0;
    for i = 1:n
      if s(i) == 0 && i ~= q(m)
        z2 = z2 + a(q(m),i);
      end
    end
    z3 = 0;
    for i = 1:n-1
      if s(i) == 1 || i == q(m)
        continue
      end
      for j = i+1:n
        if s(j) == 1 || j == q(m)
          continue
        end
        z3 = z3 + b(i,j); %max(a(i,j),a(j,i));
      end
    end

    if z1 + z2 + z3 > z
      trig1 = 0; s(q(m)) = 1; z1 = z1 + z2;
    end
  end
end
x(n+1) = [];
reordered_matrix = a(x,x);
summat = sum(sum(a)); lindx = z./summat; con = 0; incon = 0;
for i = 1:n-1
    for j = i+1:n
        if reordered_matrix(i,j) > reordered_matrix(j,i)
             con = con + 1;
        end
        if reordered_matrix(i,j) < reordered_matrix(j,i)
             incon = incon + 1;
        end
    end
end
domindex = z; permopt = x;
toc;


Program 5: BBSUBSET.M
function [subset, index] = bbsubset(a, num_subsets, sub_size)
%BBSUBSET.M: This program reads an n-by-n matrix 'a' and will extract
%'num_subsets' each of size 'sub_size' based on the objective of maximizing
%the within subset sums of matrix elements minus the between subset sums of
%matrix elements. The program is fast when sub_size = 2, but can be much
%slower for sub_size > 2.

tic;
[n,n1]=size(a); ict = 0; c = num_subsets; gs = sub_size;
for k = 1:c
  for h = 1:gs
     ict = ict + 1; tau(ict) = k;
  end
end
targ = ict;

ict = 0;
for i = 1:n-1
  for j = i+1:n
    ict = ict + 1; r(ict) = a(i,j); pair1(ict) = i; pair2(ict) = j;
  end
end

% This block is designed to establish bounds for the between subset
% and within subset sums.
r = -r; [r,idx] = sort(r); r = -r; pair1 = pair1(idx); pair2 = pair2(idx);
delta = zeros(1,targ); omega = zeros(1,targ);

ws = 0; bs = 0;
ic = 0;
for k = 1:c
  for h = 1:gs
    ic = ic + 1;
    for l = 1:h-1
      ws = ws + 1;
    end
    delta(ic) = ws;
    for l = 1:k-1
      for u = 1:gs
        bs = bs + 1;
      end
    end
    omega(ic) = bs;
  end
end

within_terms = 0; between_terms = 0;
for k = 1:c
  within_terms = within_terms + gs.*(gs-1)./2;
end
for k = 1:c-1;
  for h = k+1:c;
    between_terms = between_terms + gs.*gs;
  end
end

%This block is a greedy heuristic for obtaining an initial lower bound for
%the within subset sums minus the between subset sums.

sel = zeros(1,n);
prt = zeros(k,n);
for k = 1:c
  amax = -99999999;
  for i = 1:n-1
    if sel(i) == 1
       continue
    end
    for j = i+1:n
       if sel(j) == 1
         continue
       end
       asum = 0;
       for h = 1:k-1
         for l = 1:gs
           asum = asum - a(i,prt(h,l)) - a(j,prt(h,l));
         end
       end
       if a(i,j) + asum > amax
         amax = a(i,j)+asum; isel = i; jsel = j;
       end
    end
  end
  prt(k,1) = isel; prt(k,2) = jsel; sel(isel) = 1; sel(jsel) = 1;
  for i = 3:gs
    amax = -99999999;
    for j = 1:n
       if sel(j) == 1
         continue
       end
       asum = 0;
       for h = 1:i-1
         asum = asum + a(j,prt(k,h));
       end
       for h = 1:k-1
         for l = 1:gs
           asum = asum - a(j,prt(h,l));
         end
       end
       if asum > amax
         jsel = j; amax = asum;
       end
    end
    prt(k,i) = jsel; sel(jsel) = 1;
  end
end
z = 0;
for k = 1:c
  for i = 1:gs-1
    for j = i+1:gs
      z = z + a(prt(k,i),prt(k,j));
    end
  end
end
for k = 1:c-1
  for l = k+1:c
    for i = 1:gs
      for j = 1:gs
         z = z - a(prt(k,i),prt(l,j));
      end
    end
  end
end
z = z-1;

%The branch-and-bound algorithm begins here
q = zeros(1,n+1); s = zeros(1,n+1); test_sum = 0; tsum = zeros(1,n);
m = 1; q(m) = 1; s(1) = 1; trig1 = 0; trigend = 0;
while trigend == 0
   if trig1 == 0
     m = m + 1; trig1 = 1;     % advance pointer
   end
   q(m) = q(m) + 1;
   if s(q(m)) ~= 0             % redundancy
     continue
   end
   if m > 2
     if tau(m-1)==tau(m) && q(m) < q(m-1)      % redundancy
       continue
     end
     if tau(m-1)~=tau(m) && q(m) < q(m-gs)       % redundancy
       continue
     end
   end
   if m == 1 && q(m) > n       % terminate
     trigend = 1;
     continue
   end
   if m > 1 && q(m) > n        % retract
     s(q(m)) = 0; q(m) = 0; m = m - 1; test_sum = test_sum - tsum(m); tsum(m) =
0;
     s(q(m)) = 0;
     continue
   end

  for i = 1:m-1
    v = q(i);
    if tau(i) == tau(m)
      tsum(m) = tsum(m) + a(v,q(m));
    else
      tsum(m) = tsum(m) - a(v,q(m));
    end
  end
  test_sum = test_sum + tsum(m);
  w = within_terms - delta(m); % b = between_terms - omega(m);
  rbb = 0; rff = 0; isum = 0; isel = 0;
  while isel < w
      isum = isum + 1; pr1 = pair1(isum); pr2 = pair2(isum);
      if s(pr1)~=0 && s(pr2) ~= 0
          continue
      end
      if s(pr1) > 0 && s(pr1) ~= tau(m)
          continue
      end
      if s(pr2) > 0 && s(pr2) ~= tau(m)
          continue
      end
      rbb = rbb + r(isum); isel = isel + 1;
  end

  if test_sum + rbb - rff > z           % Partial solution passes so
    if m == targ                          % update incumbent if an optimum
       z = test_sum;
       partition = q;
       test_sum = test_sum - tsum(m); tsum(m) = 0;
       continue
    end
    trig1 = 0; s(q(m))=tau(m);          % Otherwise, branch forward
  else
    test_sum = test_sum - tsum(m); tsum(m) = 0;
  end
end
subset = partition; index = z;
toc

				
DOCUMENT INFO
Shared By:
Categories:
Tags:
Stats:
views:6
posted:3/19/2012
language:
pages:14