Home > gmmbayestb-v1.0 > gmmbvl_kmeans.m

gmmbvl_kmeans

PURPOSE ^

gmmbvl_kmeans - clustering with k-means (or Generalized Lloyd or LBG) algorithm

SYNOPSIS ^

function [Er,M,nb] = gmmbvl_kmeans(X,T,kmax,dyn,bs, killing, pl)

DESCRIPTION ^

 gmmbvl_kmeans - clustering with k-means (or Generalized Lloyd or LBG) algorithm

 [Er,M,nb] = gmmbvl_kmeans(X,T,kmax,dyn,bs,killing,pl)

 X       - (n x d) d-dimensional input data
 T       - (? x d) d-dimensional test data
 kmax    - (maximal) number of means
 dyn     - 0: standard k-means, unif. random subset of data init.
           1: fast global k-means
           2: non-greedy, just use kdtree to initialize the means
           3: fast global k-means, use kdtree for potential insertion locations
           4: global k-means algorithm
 bs      - desired number of buckets on the kd-tree
 killing - 1: remove from M the means that are assigned d or fewer data points
 pl      - plot the fitting process

 returns
 Er - sum of squared distances to nearest mean (second column for test data)
 M  - (k x d) matrix of cluster centers; k is computed dynamically
 nb - number of nodes on the kd-tree (option dyn=[2,3])

 Nikos Vlassis & Sjaak Verbeek, 2001, http://www.science.uva.nl/~jverbeek

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

function [Er,M,nb] = gmmbvl_kmeans(X,T,kmax,dyn,bs,killing,pl)
% gmmbvl_kmeans - clustering with k-means (or Generalized Lloyd or LBG) algorithm
%
% [Er,M,nb] = gmmbvl_kmeans(X,T,kmax,dyn,bs,killing,pl)
%
% X       - (n x d) d-dimensional input data
% T       - (? x d) d-dimensional test data, or [] for none
% kmax    - (maximal) number of means, must be >= 1
% dyn     - 0: standard k-means, unif. random subset of data init.
%           1: fast global k-means
%           2: non-greedy, just use kdtree to initialize the means
%           3: fast global k-means, use kdtree for potential insertion locations
%           4: global k-means algorithm
%           (any other value behaves like 0)
% bs      - desired number of buckets on the kd-tree (only read when dyn=3)
% killing - 1: remove from the returned M every mean that was assigned
%           d or fewer data points in the last iteration
%           (optional, default 0)
% pl      - nonzero: plot the fitting process and print cluster insertions
%           (optional, default 0)
%
% returns
% Er - sum of squared distances to nearest mean (second column for test data)
% M  - (k x d) matrix of cluster centers; k is computed dynamically
% nb - number of nodes on the kd-tree (option dyn=[2,3]), 0 otherwise
%
% NOTE: dyn=2 and dyn=3 call kdtree(); the kdtree defined at the end of
% this file is a stub that always raises an error.
%
% Nikos Vlassis & Sjaak Verbeek, 2001, http://www.science.uva.nl/~jverbeek

%
% $Name:  $

% Trailing arguments are optional so that short calls do not fail on an
% undefined variable the first time 'killing' or 'pl' is read.
if nargin < 7, pl      = 0; end
if nargin < 6, killing = 0; end

if kmax < 1
  % With kmax < 1 the main loop would never run and Wnew would be undefined.
  error('gmmbvl_kmeans: kmax must be at least 1.');
end

Er=[]; TEr=[];              % error monitoring (training / test)

[n,d]     = size(X);

THRESHOLD = 1e-4;   % relative change in error that is regarded as convergence
nb        = 0;

% initialize
% mean(X,1) is used instead of mean(X) so that a single-row X still yields
% a (1 x d) mean rather than a scalar.
if dyn==1            % greedy insertion, possible at all points
  k      = 1;
  M      = mean(X,1);
  K      = gmmbvl_sqdist(X',X');
  L      = X;
elseif dyn==2        % use kd-tree results as means
  k      = kmax;
  M      = kdtree(X,(1:n)',[],1.5*n/k);
  nb     = size(M,1);
  dyn    = 0;        % from here on behave like standard k-means
elseif dyn==3        % greedy insertion, candidates come from the kd-tree
  L      = kdtree(X,(1:n)',[],1.5*n/bs);
  nb     = size(L,1);
  k      = 1;
  M      = mean(X,1);
  K      = gmmbvl_sqdist(X',L');
elseif dyn==4        % exhaustive insertion; K and L are never read here,
  k      = 1;        % so the (n x n) distance matrix is not built
  M      = mean(X,1);
else                 % use random subset of data as means
  if kmax > n
    error('gmmbvl_kmeans: kmax (%d) exceeds the number of data points (%d).', kmax, n);
  end
  k      = kmax;
  tmp    = randperm(n);
  M      = X(tmp(1:k),:);
end

Wold = realmax;
kill = [];

while k <= kmax
  kill = [];

  % squared Euclidean distances to means; Dist (k x n)
  Dist = gmmbvl_sqdist(M',X');

  % Voronoi partitioning
  [Dwin,Iwin] = min(Dist',[],2);

  % error measures and mean updates
  Wnew = sum(Dwin);

  % update VQ's; a mean owning d or fewer points is left where it is and,
  % if requested, marked for removal
  for i=1:size(M,1)
    I = find(Iwin==i);
    if size(I,1)>d
      M(i,:) = mean(X(I,:),1);
    elseif killing==1
      kill = [kill; i];
    end
  end

  % The threshold is 10x looser while k can still grow and tight once
  % k==kmax.  Wnew==0 is tested explicitly: once the error is exactly zero
  % the ratio Wnew/Wold becomes 0/0 = NaN on the next pass, the comparison
  % is false forever and the loop would never terminate.
  converged = (Wnew == 0) || (1-Wnew/Wold < THRESHOLD*(10-9*(k==kmax)));

  if converged
    if dyn && k < kmax

      if dyn == 4
        % global k-means: try every data point as the initial position of
        % the new mean, run k-means to (loose) convergence, keep the best
        best_Er = Wnew;
        best_M  = [];

        for c=1:n
          Wprev = Inf;
          Wtmp  = Wnew;
          Mtmp  = [M; X(c,:)];
          while (1-Wtmp/Wprev) > THRESHOLD*10
            Wprev = Wtmp;
            Dist  = gmmbvl_sqdist(Mtmp',X');
            [Dwin,Iwin] = min(Dist',[],2);
            Wtmp  = sum(Dwin);
            for j = 1 : size(Mtmp,1)
              I = find(Iwin==j);
              if size(I,1)>d; Mtmp(j,:) = mean(X(I,:),1); end
            end
          end
          if Wtmp < best_Er;   best_M = Mtmp; best_Er = Wtmp; end
        end

        if isempty(best_M)
          % No insertion lowered the error (e.g. it is already zero).
          % Stop growing: the next pass applies the tight threshold to the
          % current means and then leaves the loop.
          kmax = k;
        else
          M = best_M;
          Wnew = best_Er;
          if ~isempty(T); tmp=gmmbvl_sqdist(T',M'); TEr=[TEr; sum(min(tmp,[],2))];end;
          Er=[Er; Wnew];
          k = k+1;
        end

      else
        % try to add a new cluster on some point x_i: pick the candidate
        % location with the largest guaranteed reduction of the error
        [tmp,best] = max(sum(max(repmat(Dwin,1,size(K,2))-K,0),1));
        k = k+1;
        M = [M; L(best,:)+eps];
        if pl;        fprintf( 'new cluster, k=%d\n', k);      end
        % NOTE(review): the training error logged here is the converged
        % error before the insertion (Dist is not recomputed), while the
        % test error below already includes the new mean.  Kept as in the
        % original; confirm which of the two is intended.
        Er=[Er; Wnew];
        if ~isempty(T); tmp=gmmbvl_sqdist(T',M'); TEr=[TEr; sum(min(tmp,[],2))];end;
      end
    else
      k = kmax+1;     % converged at the final k: leave the loop
    end
  end
  Wold = Wnew;
  if pl
    if d >= 2         % the plot needs two coordinates
      figure(1); plot(X(:,1),X(:,2),'g.',M(:,1),M(:,2),'k.',M(:,1),M(:,2),'k+');
      drawnow;
    end
  end
end

Er=[Er; Wnew];
if ~isempty(T); tmp=gmmbvl_sqdist(T',M'); TEr=[TEr; sum(min(tmp,[],2))]; Er=[Er TEr];end;
M(kill,:)=[];
0139 
0140 
0141 
function varargout = kdtree(varargin)
% kdtree - stub standing in for a kd-tree builder
%
% Accepts any arguments, assigns no outputs and always raises an error,
% because no kd-tree implementation is available in this file.
msg = 'gmmbvl_kmeans:kdtree was called, but there is no implementation. This is an internal error.';
error(msg);

Generated on Thu 14-Apr-2005 13:50:22 by m2html © 2003