// Question 2
//
// This program models a 2-class problem using a Gaussian Mixture Model with a
// specified number of clusters and uses the EM algorithm to estimate the
// model's parameters. The performance of the resulting model is evaluated on
// the specified data set.
//
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>

// large sentinel value used to initialize running minima
const double MAXDBL = 1e300;

// 2-class data container
// A, B   : pointers to data
// NA, NB : data counts
// D      : data dimensionality
typedef struct {
    double *A, *B;
    int NA, NB, D;
} Data;

// model container
// muA, muB, sigA, sigB : per-cluster means and diagonal variances
// pA, pB               : class priors
// piA, piB             : GMM mixing coefficients
typedef struct {
    double *muA, *muB, *sigA, *sigB, *piA, *piB;
    double pA, pB;
} Model;

// Storage conventions used throughout this file:
//  - data is stored one coordinate at a time, [x0 ... xN y0 ... yN z0 ... zN],
//    so data[d*N+n] is the d-th coordinate of point n
//  - per-cluster statistics are stored with the cluster index varying fastest,
//    so mu[K*d+c] is the d-th coordinate of the mean of cluster c

// fills an array with the specified value
static inline void fill(double* arr, double val, int n)
{
    int i;
    for( i = 0; i < n; i++ )
        arr[i] = val;
}

// allocates an array and fills it with the specified value
static inline double* mallocfill(size_t n, size_t size, double val)
{
    double* arr = (double*)malloc(n*size);
    fill(arr, val, n);
    return arr;
}

// performs the matrix multiplication C = A*B, where A is MxD and B is DxN
void mmult(double* A, double* B, double* C, int M, int D, int N)
{
    int m, n, d;
    fill(C,0,M*N);
    for( n = 0; n < N; n++ )
        for( m = 0; m < M; m++ )
            for( d = 0; d < D; d++ )
                C[M*n+m] += A[M*d+m]*B[D*n+d];
}

// computes the L2 norm of a vector
double vnorm2(double* v, int D)
{
    double r = 0;
    int d;
    for( d = 0; d < D; d++ )
        r += v[d]*v[d];
    return sqrt(r);
}

// computes, for each cluster, the per-dimension moment of the specified order
// about mu; used to compute the mean and diagonal covariance of each cluster
void clustmoment(double* data, unsigned char* labels, int order, int D, int N,
                 int K, double* Nk, double* mu, double* mom)
{
    int d, n;
    fill(mom,0,K*D);
    for( d = 0; d < D; d++ )
        for( n = 0; n < N; n++ )
            mom[K*d+labels[n]] += pow(data[d*N+n]-mu[K*d+labels[n]],order)/Nk[labels[n]];
}

// computes the means and diagonal covariance matrices of all clusters
void suffstat(double* data, unsigned char* labels, int D, int N, int K,
              double* mu, double* sig, double* Nk)
{
    double* zero = mallocfill(K*D,sizeof(double),0);
    clustmoment(data,labels,1,D,N,K,Nk,zero,mu);  // first moment: means
    clustmoment(data,labels,2,D,N,K,Nk,mu,sig);   // second central moment: variances
    free(zero);
}

// computes the full covariance matrix of the specified cluster
// (the normalization constant is unimportant here, since the matrix is only
// used to extract the dominant eigenvector, which is scale-invariant)
void clustcov(double* data, unsigned char* labels, int c, int K, int D, int N,
              double* mu, double* cov)
{
    int i, j, n;
    fill(cov,0,D*D);
    for( i = 0; i < D; i++ ) {
        for( j = i; j < D; j++ ) {
            for( n = 0; n < N; n++ ) {
                if( labels[n] != c )
                    continue;
                cov[i*D+j] += (data[i*N+n]-mu[i*K+c])*(data[j*N+n]-mu[j*K+c]);
            }
            cov[i*D+j] /= (N-1);
            cov[j*D+i] = cov[i*D+j];  // the covariance matrix is symmetric
        }
    }
}

// counts the number of members of each cluster
void clustdatacount(double* Nk, unsigned char* labels, int N, int K)
{
    int n;
    fill(Nk,0,K);
    for( n = 0; n < N; n++ )
        Nk[labels[n]]++;
}

// sums the elements of a matrix along one of its dimensions
void vsum(double* A, int N, int D, unsigned char dim, double* v)
{
    int n, d;
    if( dim == 0 ) {
        fill(v,0,D);
        for( d = 0; d < D; d++ )
            for( n = 0; n < N; n++ )
                v[d] += A[d*N+n];
    } else {
        fill(v,0,N);
        for( d = 0; d < D; d++ )
            for( n = 0; n < N; n++ )
                v[n] += A[d*N+n];
    }
}

// returns the index of the maximum value in a vector
int vmax(double* A, int N)
{
    int n;
    int idx = 0;
    double val = A[0];
    for( n = 1; n < N; n++ ) {
        if( A[n] > val ) {
            val = A[n];
            idx = n;
        }
    }
    return idx;
}
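// The eig routine below uses the power method: starting from a vector v0, it
// repeatedly applies M and normalizes,
//   v_{k+1} = M v_k / ||M v_k||,
// which converges to the eigenvector of the dominant (largest-magnitude)
// eigenvalue whenever v0 has a nonzero component along it. For a covariance
// matrix, that dominant eigenvector is the principal component of the data.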
// computes the eigenvector associated with the dominant eigenvalue
void eig(double* mat, int N, double* vcurr)
{
    int maxIter = 1000;
    int i, n;
    double norm, e;
    double* vprev = mallocfill(N,sizeof(double),1);
    fill(vcurr,0,N);

    // power method for finding the first eigenvector of mat:
    // start from a vector of ones, multiply by mat, then normalize
    // the result and repeat the process until convergence
    for( i = 0; i < maxIter; i++ ) {
        mmult(vprev,mat,vcurr,1,N,N);
        e = 0;
        norm = vnorm2(vcurr,N);
        for( n = 0; n < N; n++ ) {
            vcurr[n] /= norm;
            e += pow(vcurr[n]-vprev[n],2);
        }
        if( e < 1e-15 )
            break;
        memcpy(vprev,vcurr,sizeof(double)*N);
    }
    free(vprev);
}

// computes the D-dimensional Gaussian density with a diagonal covariance
// matrix for data point n and cluster c
static inline double indmvnpdf(double* data, int n, int c, int D, int N, int K,
                               double* mu, double* sig)
{
    int d;
    double a = 0, b = 1;
    for( d = 0; d < D; d++ ) {
        a += -0.5*pow(data[d*N+n]-mu[K*d+c],2)/sig[K*d+c];
        b *= sig[K*d+c];
    }
    // 6.28318531 = 2*pi
    return exp(a)/(sqrt(b)*pow(6.28318531,D/2.0));
}

// clusters the data into the specified number of clusters and computes their
// means and diagonal covariance matrices
void kmeans(double* data, double* mu, double* sig, double* Nk, int K, int N, int D)
{
    int i, n, c, d, best;
    int maxIter = 10000;
    int minClustSize = 5;
    double dot;
    double norm;
    double e, eprev = MAXDBL;
    double* cov = mallocfill(D*D,sizeof(double),0);
    double* sigsum = mallocfill(K,sizeof(double),0);
    double* v = mallocfill(D,sizeof(double),0);
    // initially all data are part of cluster 0
    unsigned char* labels = (unsigned char*)calloc(N,sizeof(unsigned char));

    // compute cluster statistics
    clustdatacount(Nk, labels, N, K);
    suffstat(data, labels, D, N, K, mu, sig, Nk);

    // initialize kmeans by recursive splitting
    for( i = 1; i < K; i++ ) {
        // find the cluster with the largest total variance
        vsum(sig,K,D,1,sigsum);
        c = vmax(sigsum,K);

        // find the principal component of the cluster
        clustcov(data, labels, c, K, D, N, mu, cov);
        eig(cov,D,v);

        // split the chosen cluster in half, assigning points that lie on one
        // side of the principal vector to a new cluster
        for( n = 0; n < N; n++ ) {
            if( labels[n] != c )
                continue;
            // compute the dot product (x-mu)'v, where mu is the mean of the
            // chosen cluster, x is the current data point, and v is the
            // principal component vector
            dot = 0;
            for( d = 0; d < D; d++ )
                dot += (data[N*d+n]-mu[K*d+c])*v[d];
            if( dot > 0 )
                labels[n] = i;
        }

        // update cluster statistics
        clustdatacount(Nk, labels, N, K);
        suffstat(data, labels, D, N, K, mu, sig, Nk);
    }

    // run kmeans
    for( i = 0; i < maxIter; i++ ) {
        e = 0;
        for( n = 0; n < N; n++ ) {
            // find the cluster closest to the current datum
            best = labels[n];
            norm = MAXDBL;
            for( c = 0; c < K; c++ ) {
                dot = 0;
                for( d = 0; d < D; d++ )
                    dot += pow(data[N*d+n]-mu[K*d+c],2);
                if( dot < norm ) {
                    norm = dot;
                    best = c;
                }
            }
            // reassign the point to the closest cluster, unless that would
            // shrink its current cluster below the minimum allowed size
            if( best != labels[n] && Nk[labels[n]] >= minClustSize ) {
                Nk[labels[n]]--;
                labels[n] = best;
                Nk[best]++;
            }
            e += norm;
        }

        // update cluster statistics
        suffstat(data, labels, D, N, K, mu, sig, Nk);
        if( pow(e-eprev,2) < 1e-15 )
            break;
        eprev = e;
    }

    free(cov);
    free(sigsum);
    free(v);
    free(labels);
}
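// The EM routine below implements the standard GMM updates with diagonal
// covariances. In the E-step the responsibilities are
//   r_nc = pi_c * N(x_n | mu_c, sig_c) / sum_j pi_j * N(x_n | mu_j, sig_j),
// and in the M-step
//   Nk_c  = sum_n r_nc,
//   mu_c  = sum_n r_nc * x_n / Nk_c,
//   sig_c = sum_n r_nc * (x_n - mu_c)^2 / Nk_c   (element-wise),
//   pi_c  = Nk_c / N.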
// refines the cluster parameters using the EM algorithm; on entry Nk holds the
// hard cluster counts from kmeans, on exit it holds the mixing coefficients
void EM(double* data, double* mu, double* sig, double* Nk, int K, int N, int D)
{
    int i, n, d, c;
    double loglikprev = -MAXDBL;
    double loglik;
    int maxIter = 10000;
    double* resp = mallocfill(N*K,sizeof(double),0);
    double* respNorm = mallocfill(N,sizeof(double),0);
    double* pi = (double*)malloc(sizeof(double)*K);

    // EM algorithm
    for( i = 0; i < maxIter; i++ ) {
        // E-STEP: compute responsibilities
        // responsibilities are stored in two data structures to avoid
        // unnecessary computation:
        //   resp:     the unnormalized responsibility values
        //   respNorm: the per-point normalization constants
        memcpy(pi,Nk,sizeof(double)*K);
        fill(Nk,0,K);
        loglik = 0;
        for( n = 0; n < N; n++ ) {
            respNorm[n] = 0;
            for( c = 0; c < K; c++ ) {
                resp[c*N+n] = pi[c]*indmvnpdf(data,n,c,D,N,K,mu,sig);
                respNorm[n] += resp[c*N+n];
            }
            for( c = 0; c < K; c++ )
                Nk[c] += resp[c*N+n]/respNorm[n];
            loglik += log(respNorm[n])/N;
        }

        // M-STEP
        fill(mu,0,K*D);
        fill(sig,0,K*D);
        // compute the new estimates of the means
        for( c = 0; c < K; c++ ) {
            for( d = 0; d < D; d++ ) {
                for( n = 0; n < N; n++ )
                    mu[d*K+c] += resp[c*N+n]*data[N*d+n]/respNorm[n];
                mu[d*K+c] /= Nk[c];
            }
        }
        // compute the new estimates of the covariance matrices
        for( c = 0; c < K; c++ ) {
            for( d = 0; d < D; d++ ) {
                for( n = 0; n < N; n++ )
                    sig[d*K+c] += resp[c*N+n]*pow(data[N*d+n]-mu[K*d+c],2)/respNorm[n];
                sig[d*K+c] /= Nk[c];
            }
        }

        // stop when the average log-likelihood has converged
        if( pow(loglik-loglikprev,2) < 1e-12 )
            break;
        loglikprev = loglik;
    }

    // compute the mixing coefficients
    for( c = 0; c < K; c++ )
        Nk[c] /= N;

    free(pi);
    free(resp);
    free(respNorm);
}

// trains the GMM model for the two classes using the EM algorithm
Model train(Data data, int K)
{
    Model model;
    model.muA = mallocfill(data.D*K,sizeof(double),0);
    model.muB = mallocfill(data.D*K,sizeof(double),0);
    model.sigA = mallocfill(data.D*K,sizeof(double),0);
    model.sigB = mallocfill(data.D*K,sizeof(double),0);
    model.piA = mallocfill(K,sizeof(double),0);
    model.piB = mallocfill(K,sizeof(double),0);
    model.pA = (double)data.NA/(data.NA+data.NB);
    model.pB = (double)data.NB/(data.NA+data.NB);

    // initialize each class model with kmeans, then refine it with EM
    kmeans(data.A, model.muA, model.sigA, model.piA, K, data.NA, data.D);
    EM(data.A, model.muA, model.sigA, model.piA, K, data.NA, data.D);
    kmeans(data.B, model.muB, model.sigB, model.piB, K, data.NB, data.D);
    EM(data.B, model.muB, model.sigB, model.piB, K, data.NB, data.D);
    return model;
}

// evaluates a Gaussian Mixture Model with the specified parameters at point n
static inline double gmm(double* data, int n, int D, int N, int K,
                         double* mu, double* sig, double* pi)
{
    int c;
    double val = 0;
    for( c = 0; c < K; c++ )
        val += pi[c]*indmvnpdf(data,n,c,D,N,K,mu,sig);
    return val;
}
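// The test routine below applies the MAP decision rule: a point x is assigned
// to class A iff P(A)*p(x|A) > P(B)*p(x|B), where p(x|.) is the class GMM and
// P(.) is the class prior; ties are counted as errors.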
// computes the total error rate of a model on a data set
double test(Data d, Model m, int K)
{
    int n;
    double err = 0;
    // evaluate model performance using the MAP decision rule
    for( n = 0; n < d.NA; n++ )
        err += m.pA*gmm(d.A, n, d.D, d.NA, K, m.muA, m.sigA, m.piA) <=
               m.pB*gmm(d.A, n, d.D, d.NA, K, m.muB, m.sigB, m.piB);
    for( n = 0; n < d.NB; n++ )
        err += m.pA*gmm(d.B, n, d.D, d.NB, K, m.muA, m.sigA, m.piA) >=
               m.pB*gmm(d.B, n, d.D, d.NB, K, m.muB, m.sigB, m.piB);
    return err/(d.NA+d.NB);
}

// reads data from a file and stores it in column-major order
// [x0 ... xN y0 ... yN z0 ... zN]
int readdata(char* fname, Data* set)
{
    FILE* fid = fopen(fname,"r");
    int N = 10000;
    int n;
    int nA = 0, nB = 0;
    char* c = (char*)malloc(sizeof(char)*N);
    double* x = (double*)malloc(sizeof(double)*N);
    double* y = (double*)malloc(sizeof(double)*N);
    double* z = (double*)malloc(sizeof(double)*N);
    set->NA = 0;
    set->NB = 0;
    set->D = 3;

    if( fid == NULL ) {
        printf("Unable to read %s\n",fname);
        return -1;
    }

    for( n = 0; ; n++ ) {
        // grow the temporary arrays if needed
        if( n == N ) {
            N *= 2;
            x = (double*)realloc(x,sizeof(double)*N);
            y = (double*)realloc(y,sizeof(double)*N);
            z = (double*)realloc(z,sizeof(double)*N);
            c = (char*)realloc(c,sizeof(char)*N);
        }
        if( fscanf(fid,"%c: (%lf %lf %lf)\n",&c[n],&x[n],&y[n],&z[n]) != 4 )
            break;
        if( c[n] == 'A' )
            set->NA++;
        else
            set->NB++;
    }
    fclose(fid);
    N = n;

    // separate the two classes into their own column-major arrays
    set->A = (double*)malloc(sizeof(double)*set->NA*3);
    set->B = (double*)malloc(sizeof(double)*set->NB*3);
    for( n = 0; n < N; n++ ) {
        if( c[n] == 'A' ) {
            set->A[nA] = x[n];
            set->A[set->NA*1+nA] = y[n];
            set->A[set->NA*2+nA] = z[n];
            nA++;
        } else {
            set->B[nB] = x[n];
            set->B[set->NB*1+nB] = y[n];
            set->B[set->NB*2+nB] = z[n];
            nB++;
        }
    }

    free(c);
    free(x);
    free(y);
    free(z);
    return 0;
}

// frees all memory allocated for a model
void cleanup(Model model)
{
    free(model.muA);
    free(model.muB);
    free(model.sigA);
    free(model.sigB);
    free(model.piA);
    free(model.piB);
}

int main(int argc, char* argv[])
{
    Data trainData, testData;

    if( argc < 3 ) {
        printf("***********************************************\n");
        printf("Question 2\n");
        printf("Error: Not enough arguments.\n");
        printf("Please specify training and testing datasets:\n");
        printf("q2 Train.dat Test.dat\n");
        printf("***********************************************\n");
        return 0;
    }

    // read data
    if( readdata(argv[1], &trainData) == -1 ||
        readdata(argv[2], &testData) == -1 )
        return 0;

    // train and test the 2-cluster model
    Model model2 = train(trainData,2);
    printf("2 Clusters:\n");
    printf("Train set: %2.2f%% correct\n",(1-test(trainData, model2, 2))*100);
    printf("Test set: %2.2f%% correct\n",(1-test(testData, model2, 2))*100);

    // train and test the 4-cluster model
    Model model4 = train(trainData,4);
    printf("4 Clusters:\n");
    printf("Train set: %2.2f%% correct\n",(1-test(trainData, model4, 4))*100);
    printf("Test set: %2.2f%% correct\n",(1-test(testData, model4, 4))*100);

    // train and test the 8-cluster model
    Model model8 = train(trainData,8);
    printf("8 Clusters:\n");
    printf("Train set: %2.2f%% correct\n",(1-test(trainData, model8, 8))*100);
    printf("Test set: %2.2f%% correct\n",(1-test(testData, model8, 8))*100);

    cleanup(model2);
    cleanup(model4);
    cleanup(model8);
    free(trainData.A);
    free(trainData.B);
    free(testData.A);
    free(testData.B);
    return 0;
}
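// Expected input format (one point per line, as parsed by readdata above;
// the coordinate values here are made up for illustration):
//   A: (0.1 -2.3 1.7)
//   B: (4.2 0.0 -1.1)
// Assuming a POSIX toolchain, the program would typically be built with
// something like
//   gcc q2.c -o q2 -lm
// where -lm links the math library used by sqrt/pow/exp/log.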