// QUESTION 1 // // This program models a 2-class problem using two 3-D Gaussian distributions // with diagonal covariance matrices and evaluates the performance of the // resulting model on the specified data set. // #include "math.h" #include "stdlib.h" #include "stdio.h" // 2-class data container // A, B : pointers to data // NA, NB : data counts // D : data dimensionality typedef struct { double *A, *B; int NA, NB, D; } Data; // model container // muA, muB, sigA, sigB : mean, variance // pA, pB : priors typedef struct { double *muA, *muB, *sigA, *sigB; double pA, pB; } Model; // fills an array using the specified value inline void fill(double* arr, double val, int n) { int i; for( i = 0; i < n; i++ ) arr[i] = val; } // allocates an array and fills it with the specified value inline double* mallocfill(size_t n, size_t size, double val) { double* arr = (double*)malloc(n*size); fill(arr, val, n); return arr; } // calculates the moment about the mean of specified order // used to compute mean and standard deviation of data void moment(double* data, int D, int N, int order, double* mu, double* mom) { int d, n; for( d = 0; d < D; d++ ) for( n = 0; n < N; n++ ) mom[d] += pow(data[d*N+n]-mu[d],order); for( d = 0; d < D; d++ ) mom[d] /= N; } // computes the mean and diagonal covariance matrix of a Gaussian void suffstat(double* data, int D, int N, double* mu, double* sig) { double* zero = mallocfill(D,sizeof(double),0); moment(data, D, N, 1, zero, mu); moment(data, D, N, 2, mu, sig); free(zero); } // trains models for the two classes Model train(Data data) { Model model; model.muA = mallocfill(data.D,sizeof(double),0); model.muB = mallocfill(data.D,sizeof(double),0); model.sigA = mallocfill(data.D,sizeof(double),0); model.sigB = mallocfill(data.D,sizeof(double),0); model.pA = (double)data.NA/(data.NA+data.NB); model.pB = (double)data.NB/(data.NA+data.NB); suffstat(data.A, data.D, data.NA, model.muA, model.sigA); suffstat(data.B, data.D, data.NB, model.muB, model.sigB); return model; } // computes the log of a 3-D Gaussian distribution with a diagonal covariance matrix double logindmvnpdf(double* data, int n, int D, int N, double* mu, double* sig) { int d; double a = 0, b = 1; for( d = 0; d < D; d++ ) { a += -pow(data[d*N+n]-mu[d],2)/sig[d]; b *= sig[d]; } return 0.5*(a-log(b)); } // computes the total error of a model on a data set float test(Data d, Model m) { int n; float err = 0; double a, b; // evaluate model performance using the MAP decision rule for( n = 0; n < d.NA; n++ ) err += log(m.pA)+logindmvnpdf(d.A, n, d.D, d.NA, m.muA, m.sigA) <= log(m.pB)+logindmvnpdf(d.A, n, d.D, d.NA, m.muB, m.sigB); for( n = 0; n < d.NB; n++ ) err += log(m.pA)+logindmvnpdf(d.B, n, d.D, d.NB, m.muA, m.sigA) >= log(m.pB)+logindmvnpdf(d.B, n, d.D, d.NB, m.muB, m.sigB); return err/(d.NA+d.NB); } // reads data from a file and stores it in column-major order [x0 ... xN y0 ... yN z0 ... zN] int readdata(char* fname, Data* set) { FILE* fid = fopen(fname,"r"); int N = 10000; int n; int nA = 0, nB = 0; char* c = (char*)malloc(sizeof(char)*N); double* x = (double*)malloc(sizeof(double)*N); double* y = (double*)malloc(sizeof(double)*N); double* z = (double*)malloc(sizeof(double)*N); set->NA = 0; set->NB = 0; set->D = 3; if( fid == NULL ) { printf("Unable to read %s\n",fname); return -1; } for( n = 0; !feof(fid); n++ ) { if( n == N ) { N *= 2; realloc(x,sizeof(double)*N); realloc(y,sizeof(double)*N); realloc(z,sizeof(double)*N); realloc(c,sizeof(char)*N); } fscanf(fid,"%c: (%lf %lf %lf)\n",&c[n],&x[n],&y[n],&z[n]); c[n] == 'A' ? set->NA++ : set->NB++; } fclose(fid); N = n; set->A = (double*)malloc(sizeof(double)*set->NA*3); set->B = (double*)malloc(sizeof(double)*set->NB*3); for( n = 0; n < N; n++ ) { if( c[n] == 'A' ) { set->A[nA] = x[n]; set->A[set->NA*1+nA] = y[n]; set->A[set->NA*2+nA] = z[n]; nA++; } else { set->B[nB] = x[n]; set->B[set->NB*1+nB] = y[n]; set->B[set->NB*2+nB] = z[n]; nB++; } } free(c); free(x); free(y); free(z); return 0; } int main(int argc, char* argv[]) { Data trainData, testData; if( argc < 3 ) { printf("***********************************************\n"); printf("Question 3\n"); printf("Error: Not enough arguments.\n"); printf("Please specify training and testing datasets:\n"); printf("q3 Train.dat Test.dat\n"); printf("***********************************************\n"); } // read data if( readdata(argv[1], &trainData) == -1 || readdata(argv[2], &testData) == -1 ) return 0; // train model Model model = train(trainData); // test model printf("Train set: %2.2f%% correct\n",(1-test(trainData, model))*100); printf("Test set: %2.2f%% correct\n",(1-test(testData, model))*100); free(model.muA); free(model.muB); free(model.sigA); free(model.sigB); free(trainData.A); free(trainData.B); free(testData.A); free(testData.B); }