// Question 2
//
// This program models a 2-class problem using a Gaussian Mixture Model with a
// specified number of clusters and uses the EM algorithm to estimate the
// model's parameters. The performance of the resulting model is evaluated on
// the specified data set.
//
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>

// large sentinel value used to initialize running minima
const double MAXDBL = 1e300;

// 2-class data container
// A, B   : pointers to data
// NA, NB : data counts
// D      : data dimensionality
typedef struct {
    double *A, *B;
    int NA, NB, D;
} Data;

// model container
// muA, muB, sigA, sigB : per-cluster means and diagonal variances
// pA, pB               : class priors
// piA, piB             : GMM mixing coefficients
typedef struct {
    double *muA, *muB, *sigA, *sigB, *piA, *piB;
    double pA, pB;
} Model;

// Storage conventions used throughout this file:
//  - data is stored one coordinate at a time, [x0 ... xN y0 ... yN z0 ... zN],
//    so data[d*N+n] is the d-th coordinate of point n
//  - per-cluster statistics are stored with the cluster index varying fastest,
//    so mu[K*d+c] is the d-th coordinate of the mean of cluster c

// fills an array with the specified value
static inline void fill(double* arr, double val, int n)
{
    int i;
    for( i = 0; i < n; i++ )
        arr[i] = val;
}

// allocates an array and fills it with the specified value
static inline double* mallocfill(size_t n, size_t size, double val)
{
    double* arr = (double*)malloc(n*size);
    fill(arr, val, n);
    return arr;
}

// performs the matrix multiplication C = A*B, where A is MxD and B is DxN
void mmult(double* A, double* B, double* C, int M, int D, int N)
{
    int m, n, d;
    fill(C,0,M*N);
    for( n = 0; n < N; n++ )
        for( m = 0; m < M; m++ )
            for( d = 0; d < D; d++ )
                C[M*n+m] += A[M*d+m]*B[D*n+d];
}

// computes the L2 norm of a vector
double vnorm2(double* v, int D)
{
    double r = 0;
    int d;
    for( d = 0; d < D; d++ )
        r += v[d]*v[d];
    return sqrt(r);
}

// computes, for each cluster, the per-dimension moment of the specified order
// about mu; used to compute the mean and diagonal covariance of each cluster
void clustmoment(double* data, unsigned char* labels, int order, int D, int N,
                 int K, double* Nk, double* mu, double* mom)
{
    int d, n;
    fill(mom,0,K*D);
    for( d = 0; d < D; d++ )
        for( n = 0; n < N; n++ )
            mom[K*d+labels[n]] += pow(data[d*N+n]-mu[K*d+labels[n]],order)/Nk[labels[n]];
}

// computes the means and diagonal covariance matrices of all clusters
void suffstat(double* data, unsigned char* labels, int D, int N, int K,
              double* mu, double* sig, double* Nk)
{
    double* zero = mallocfill(K*D,sizeof(double),0);
    clustmoment(data,labels,1,D,N,K,Nk,zero,mu);  // first moment: means
    clustmoment(data,labels,2,D,N,K,Nk,mu,sig);   // second central moment: variances
    free(zero);
}

// computes the full covariance matrix of the specified cluster
// (the normalization constant is unimportant here, since the matrix is only
// used to extract the dominant eigenvector, which is scale-invariant)
void clustcov(double* data, unsigned char* labels, int c, int K, int D, int N,
              double* mu, double* cov)
{
    int i, j, n;
    fill(cov,0,D*D);
    for( i = 0; i < D; i++ ) {
        for( j = i; j < D; j++ ) {
            for( n = 0; n < N; n++ ) {
                if( labels[n] != c )
                    continue;
                cov[i*D+j] += (data[i*N+n]-mu[i*K+c])*(data[j*N+n]-mu[j*K+c]);
            }
            cov[i*D+j] /= (N-1);
            cov[j*D+i] = cov[i*D+j];  // the covariance matrix is symmetric
        }
    }
}

// counts the number of members of each cluster
void clustdatacount(double* Nk, unsigned char* labels, int N, int K)
{
    int n;
    fill(Nk,0,K);
    for( n = 0; n < N; n++ )
        Nk[labels[n]]++;
}

// sums the elements of a matrix along one of its dimensions
void vsum(double* A, int N, int D, unsigned char dim, double* v)
{
    int n, d;
    if( dim == 0 ) {
        fill(v,0,D);
        for( d = 0; d < D; d++ )
            for( n = 0; n < N; n++ )
                v[d] += A[d*N+n];
    } else {
        fill(v,0,N);
        for( d = 0; d < D; d++ )
            for( n = 0; n < N; n++ )
                v[n] += A[d*N+n];
    }
}

// returns the index of the maximum value in a vector
int vmax(double* A, int N)
{
    int n;
    int idx = 0;
    double val = A[0];
    for( n = 1; n < N; n++ ) {
        if( A[n] > val ) {
            val = A[n];
            idx = n;
        }
    }
    return idx;
}
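// The eig routine below uses the power method: starting from a vector v0, it
// repeatedly applies M and normalizes,
//   v_{k+1} = M v_k / ||M v_k||,
// which converges to the eigenvector of the dominant (largest-magnitude)
// eigenvalue whenever v0 has a nonzero component along it. For a covariance
// matrix, that dominant eigenvector is the principal component of the data.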
// computes the eigenvector associated with the dominant eigenvalue
void eig(double* mat, int N, double* vcurr)
{
    int maxIter = 1000;
    int i, n;
    double norm, e;
    double* vprev = mallocfill(N,sizeof(double),1);
    fill(vcurr,0,N);

    // power method for finding the first eigenvector of mat:
    // start from a vector of ones, multiply by mat, then normalize
    // the result and repeat the process until convergence
    for( i = 0; i < maxIter; i++ ) {
        mmult(vprev,mat,vcurr,1,N,N);
        e = 0;
        norm = vnorm2(vcurr,N);
        for( n = 0; n < N; n++ ) {
            vcurr[n] /= norm;
            e += pow(vcurr[n]-vprev[n],2);
        }
        if( e < 1e-15 )
            break;
        memcpy(vprev,vcurr,sizeof(double)*N);
    }
    free(vprev);
}

// computes the D-dimensional Gaussian density with a diagonal covariance
// matrix for data point n and cluster c
static inline double indmvnpdf(double* data, int n, int c, int D, int N, int K,
                               double* mu, double* sig)
{
    int d;
    double a = 0, b = 1;
    for( d = 0; d < D; d++ ) {
        a += -0.5*pow(data[d*N+n]-mu[K*d+c],2)/sig[K*d+c];
        b *= sig[K*d+c];
    }
    // 6.28318531 = 2*pi
    return exp(a)/(sqrt(b)*pow(6.28318531,D/2.0));
}

// clusters the data into the specified number of clusters and computes their
// means and diagonal covariance matrices
void kmeans(double* data, double* mu, double* sig, double* Nk, int K, int N, int D)
{
    int i, n, c, d, best;
    int maxIter = 10000;
    int minClustSize = 5;
    double dot;
    double norm;
    double e, eprev = MAXDBL;
    double* cov = mallocfill(D*D,sizeof(double),0);
    double* sigsum = mallocfill(K,sizeof(double),0);
    double* v = mallocfill(D,sizeof(double),0);
    // initially all data are part of cluster 0
    unsigned char* labels = (unsigned char*)calloc(N,sizeof(unsigned char));

    // compute cluster statistics
    clustdatacount(Nk, labels, N, K);
    suffstat(data, labels, D, N, K, mu, sig, Nk);

    // initialize kmeans by recursive splitting
    for( i = 1; i < K; i++ ) {
        // find the cluster with the largest total variance
        vsum(sig,K,D,1,sigsum);
        c = vmax(sigsum,K);

        // find the principal component of the cluster
        clustcov(data, labels, c, K, D, N, mu, cov);
        eig(cov,D,v);

        // split the chosen cluster in half, assigning points that lie on one
        // side of the principal vector to a new cluster
        for( n = 0; n < N; n++ ) {
            if( labels[n] != c )
                continue;
            // compute the dot product (x-mu)'v, where mu is the mean of the
            // chosen cluster, x is the current data point, and v is the
            // principal component vector
            dot = 0;
            for( d = 0; d < D; d++ )
                dot += (data[N*d+n]-mu[K*d+c])*v[d];
            if( dot > 0 )
                labels[n] = i;
        }

        // update cluster statistics
        clustdatacount(Nk, labels, N, K);
        suffstat(data, labels, D, N, K, mu, sig, Nk);
    }

    // run kmeans
    for( i = 0; i < maxIter; i++ ) {
        e = 0;
        for( n = 0; n < N; n++ ) {
            // find the cluster closest to the current datum
            best = labels[n];
            norm = MAXDBL;
            for( c = 0; c < K; c++ ) {
                dot = 0;
                for( d = 0; d < D; d++ )
                    dot += pow(data[N*d+n]-mu[K*d+c],2);
                if( dot < norm ) {
                    norm = dot;
                    best = c;
                }
            }
            // reassign the point to the closest cluster, unless that would
            // shrink its current cluster below the minimum allowed size
            if( best != labels[n] && Nk[labels[n]] >= minClustSize ) {
                Nk[labels[n]]--;
                labels[n] = best;
                Nk[best]++;
            }
            e += norm;
        }

        // update cluster statistics
        suffstat(data, labels, D, N, K, mu, sig, Nk);
        if( pow(e-eprev,2) < 1e-15 )
            break;
        eprev = e;
    }

    free(cov);
    free(sigsum);
    free(v);
    free(labels);
}
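// The EM routine below implements the standard GMM updates with diagonal
// covariances. In the E-step the responsibilities are
//   r_nc = pi_c * N(x_n | mu_c, sig_c) / sum_j pi_j * N(x_n | mu_j, sig_j),
// and in the M-step
//   Nk_c  = sum_n r_nc,
//   mu_c  = sum_n r_nc * x_n / Nk_c,
//   sig_c = sum_n r_nc * (x_n - mu_c)^2 / Nk_c   (element-wise),
//   pi_c  = Nk_c / N.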
// refines the cluster parameters using the EM algorithm; on entry Nk holds the
// hard cluster counts from kmeans, on exit it holds the mixing coefficients
void EM(double* data, double* mu, double* sig, double* Nk, int K, int N, int D)
{
    int i, n, d, c;
    double loglikprev = -MAXDBL;
    double loglik;
    int maxIter = 10000;
    double* resp = mallocfill(N*K,sizeof(double),0);
    double* respNorm = mallocfill(N,sizeof(double),0);
    double* pi = (double*)malloc(sizeof(double)*K);

    // EM algorithm
    for( i = 0; i < maxIter; i++ ) {
        // E-STEP: compute responsibilities
        // responsibilities are stored in two data structures to avoid
        // unnecessary computation:
        //   resp:     the unnormalized responsibility values
        //   respNorm: the per-point normalization constants
        memcpy(pi,Nk,sizeof(double)*K);
        fill(Nk,0,K);
        loglik = 0;
        for( n = 0; n < N; n++ ) {
            respNorm[n] = 0;
            for( c = 0; c < K; c++ ) {
                resp[c*N+n] = pi[c]*indmvnpdf(data,n,c,D,N,K,mu,sig);
                respNorm[n] += resp[c*N+n];
            }
            for( c = 0; c < K; c++ )
                Nk[c] += resp[c*N+n]/respNorm[n];
            loglik += log(respNorm[n])/N;
        }

        // M-STEP
        fill(mu,0,K*D);
        fill(sig,0,K*D);
        // compute the new estimates of the means
        for( c = 0; c < K; c++ ) {
            for( d = 0; d < D; d++ ) {
                for( n = 0; n < N; n++ )
                    mu[d*K+c] += resp[c*N+n]*data[N*d+n]/respNorm[n];
                mu[d*K+c] /= Nk[c];
            }
        }
        // compute the new estimates of the covariance matrices
        for( c = 0; c < K; c++ ) {
            for( d = 0; d < D; d++ ) {
                for( n = 0; n < N; n++ )
                    sig[d*K+c] += resp[c*N+n]*pow(data[N*d+n]-mu[K*d+c],2)/respNorm[n];
                sig[d*K+c] /= Nk[c];
            }
        }

        // stop when the average log-likelihood has converged
        if( pow(loglik-loglikprev,2) < 1e-12 )
            break;
        loglikprev = loglik;
    }

    // compute the mixing coefficients
    for( c = 0; c < K; c++ )
        Nk[c] /= N;

    free(pi);
    free(resp);
    free(respNorm);
}

// trains the GMM model for the two classes using the EM algorithm
Model train(Data data, int K)
{
    Model model;
    model.muA = mallocfill(data.D*K,sizeof(double),0);
    model.muB = mallocfill(data.D*K,sizeof(double),0);
    model.sigA = mallocfill(data.D*K,sizeof(double),0);
    model.sigB = mallocfill(data.D*K,sizeof(double),0);
    model.piA = mallocfill(K,sizeof(double),0);
    model.piB = mallocfill(K,sizeof(double),0);
    model.pA = (double)data.NA/(data.NA+data.NB);
    model.pB = (double)data.NB/(data.NA+data.NB);

    // initialize each class model with kmeans, then refine it with EM
    kmeans(data.A, model.muA, model.sigA, model.piA, K, data.NA, data.D);
    EM(data.A, model.muA, model.sigA, model.piA, K, data.NA, data.D);
    kmeans(data.B, model.muB, model.sigB, model.piB, K, data.NB, data.D);
    EM(data.B, model.muB, model.sigB, model.piB, K, data.NB, data.D);
    return model;
}

// evaluates a Gaussian Mixture Model with the specified parameters at point n
static inline double gmm(double* data, int n, int D, int N, int K,
                         double* mu, double* sig, double* pi)
{
    int c;
    double val = 0;
    for( c = 0; c < K; c++ )
        val += pi[c]*indmvnpdf(data,n,c,D,N,K,mu,sig);
    return val;
}
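// The test routine below applies the MAP decision rule: a point x is assigned
// to class A iff P(A)*p(x|A) > P(B)*p(x|B), where p(x|.) is the class GMM and
// P(.) is the class prior; ties are counted as errors.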
// computes the total error rate of a model on a data set
double test(Data d, Model m, int K)
{
    int n;
    double err = 0;
    // evaluate model performance using the MAP decision rule
    for( n = 0; n < d.NA; n++ )
        err += m.pA*gmm(d.A, n, d.D, d.NA, K, m.muA, m.sigA, m.piA) <=
               m.pB*gmm(d.A, n, d.D, d.NA, K, m.muB, m.sigB, m.piB);
    for( n = 0; n < d.NB; n++ )
        err += m.pA*gmm(d.B, n, d.D, d.NB, K, m.muA, m.sigA, m.piA) >=
               m.pB*gmm(d.B, n, d.D, d.NB, K, m.muB, m.sigB, m.piB);
    return err/(d.NA+d.NB);
}

// reads data from a file and stores it in column-major order
// [x0 ... xN y0 ... yN z0 ... zN]
int readdata(char* fname, Data* set)
{
    FILE* fid = fopen(fname,"r");
    int N = 10000;
    int n;
    int nA = 0, nB = 0;
    char* c = (char*)malloc(sizeof(char)*N);
    double* x = (double*)malloc(sizeof(double)*N);
    double* y = (double*)malloc(sizeof(double)*N);
    double* z = (double*)malloc(sizeof(double)*N);
    set->NA = 0;
    set->NB = 0;
    set->D = 3;

    if( fid == NULL ) {
        printf("Unable to read %s\n",fname);
        return -1;
    }

    for( n = 0; ; n++ ) {
        // grow the temporary arrays if needed
        if( n == N ) {
            N *= 2;
            x = (double*)realloc(x,sizeof(double)*N);
            y = (double*)realloc(y,sizeof(double)*N);
            z = (double*)realloc(z,sizeof(double)*N);
            c = (char*)realloc(c,sizeof(char)*N);
        }
        if( fscanf(fid,"%c: (%lf %lf %lf)\n",&c[n],&x[n],&y[n],&z[n]) != 4 )
            break;
        if( c[n] == 'A' )
            set->NA++;
        else
            set->NB++;
    }
    fclose(fid);
    N = n;

    // separate the two classes into their own column-major arrays
    set->A = (double*)malloc(sizeof(double)*set->NA*3);
    set->B = (double*)malloc(sizeof(double)*set->NB*3);
    for( n = 0; n < N; n++ ) {
        if( c[n] == 'A' ) {
            set->A[nA] = x[n];
            set->A[set->NA*1+nA] = y[n];
            set->A[set->NA*2+nA] = z[n];
            nA++;
        } else {
            set->B[nB] = x[n];
            set->B[set->NB*1+nB] = y[n];
            set->B[set->NB*2+nB] = z[n];
            nB++;
        }
    }

    free(c);
    free(x);
    free(y);
    free(z);
    return 0;
}

// frees all memory allocated for a model
void cleanup(Model model)
{
    free(model.muA);
    free(model.muB);
    free(model.sigA);
    free(model.sigB);
    free(model.piA);
    free(model.piB);
}

int main(int argc, char* argv[])
{
    Data trainData, testData;

    if( argc < 3 ) {
        printf("***********************************************\n");
        printf("Question 2\n");
        printf("Error: Not enough arguments.\n");
        printf("Please specify training and testing datasets:\n");
        printf("q2 Train.dat Test.dat\n");
        printf("***********************************************\n");
        return 0;
    }

    // read data
    if( readdata(argv[1], &trainData) == -1 ||
        readdata(argv[2], &testData) == -1 )
        return 0;

    // train and test the 2-cluster model
    Model model2 = train(trainData,2);
    printf("2 Clusters:\n");
    printf("Train set: %2.2f%% correct\n",(1-test(trainData, model2, 2))*100);
    printf("Test set: %2.2f%% correct\n",(1-test(testData, model2, 2))*100);

    // train and test the 4-cluster model
    Model model4 = train(trainData,4);
    printf("4 Clusters:\n");
    printf("Train set: %2.2f%% correct\n",(1-test(trainData, model4, 4))*100);
    printf("Test set: %2.2f%% correct\n",(1-test(testData, model4, 4))*100);

    // train and test the 8-cluster model
    Model model8 = train(trainData,8);
    printf("8 Clusters:\n");
    printf("Train set: %2.2f%% correct\n",(1-test(trainData, model8, 8))*100);
    printf("Test set: %2.2f%% correct\n",(1-test(testData, model8, 8))*100);

    cleanup(model2);
    cleanup(model4);
    cleanup(model8);
    free(trainData.A);
    free(trainData.B);
    free(testData.A);
    free(testData.B);
    return 0;
}
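// Expected input format (one point per line, as parsed by readdata above;
// the coordinate values here are made up for illustration):
//   A: (0.1 -2.3 1.7)
//   B: (4.2 0.0 -1.1)
// Assuming a POSIX toolchain, the program would typically be built with
// something like
//   gcc q2.c -o q2 -lm
// where -lm links the math library used by sqrt/pow/exp/log.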