import numpy as np
import numpy.linalg as LA
import matplotlib.pyplot as plt
import GMMem
import pickle

np.random.seed(0)

# Fill in the missing parts marked with ?

# ***** DATA
totalComponents = 10  # max number of mixture components

# load data
with open("data.pickle", "rb") as f:
    X, labels = pickle.load(f)
D, N = X.shape  # dimension and number of data points

ratio = 0.75
train_ind = np.random.choice(N, int(ratio * N), replace=False)  # training data index
test_ind = np.setdiff1d(np.arange(N), train_ind)  # test data index
Xtrain = X[:, train_ind]            # training data
Xtrain_labels = labels[train_ind]   # training data labels
Xtest = X[:, test_ind]              # test data
Xtest_labels = labels[test_ind]     # test data labels

# plot training and test data
def plot_data():
    for i in sorted(set(Xtrain_labels)):
        X_comp = Xtrain[:, Xtrain_labels == i]
        plt.plot(X_comp[0], X_comp[1], '.' + 'brgmcyk'[i - 1], markersize=6)
    plt.plot(Xtest[0], Xtest[1], 'kd', markersize=4, markeredgewidth=0.5,
             markerfacecolor="None")

plot_data()
plt.title('training data, test data (in black)')
plt.show()

# ***** Use cross-validation to choose the number of components
# (Only this part differs from the first template, where BIC is used instead)
foldCount = 5  # number of folds
loglikelihoods = np.zeros((totalComponents, foldCount))  # collect log-likelihoods
Nlearning = Xtrain.shape[1]
order = np.random.permutation(Nlearning)  # you can randomize the order of training
                                          # samples when constructing the folds below

for H in range(1, totalComponents + 1):  # number of mixture components
    print("H: {}".format(H))
    for fold in range(foldCount):  # K-fold cross-validation
        training_indices = ?  # cv training sample indices
        val_indices = ?       # cv validation sample indices
        X_train = Xtrain[:, training_indices]  # cv training data
        X_val = Xtrain[:, val_indices]         # cv validation data
        # train model on the cv training folds
        P, m, S, loglik, phgn = GMMem.GMMem(X_train, H, 100)  # fit model
        # evaluate the cv-trained model on the validation data X_val
        logl = ?  # use the function GMMem.GMMloglik
        loglikelihoods[H - 1, fold] = ?

# plot the mean validation log-likelihood for each model size
plt.bar(np.arange(1, totalComponents + 1), np.mean(loglikelihoods, axis=1))
plt.xlabel('Number of Mixture Components')
plt.ylabel('CV Log-Likelihood')
plt.title('Model Selection (Cross Validation)')
plt.show()

# select the number of mixture components which maximizes the mean
# validation log-likelihood
h = np.argmax(np.mean(loglikelihoods, axis=1)) + 1

# ***** TRAIN
# Now train the full model with the selected number of mixture components
P, m, S, loglik, phgn = GMMem.GMMem(Xtrain, h, 100)  # fit to all training data

# Evaluate the full trained model on the test data (use GMMem.GMMloglik)
logl = ?
print('Test Data Log-Likelihood = {0:f}'.format(?))

# Plot the best GMM model: draw a one-standard-deviation ellipse for each
# component from the eigendecomposition of its covariance matrix
plot_data()
for i in range(h):
    dV, E = LA.eigh(S[i, :, :])  # eigenvalues/vectors of the symmetric covariance
    theta = np.arange(0, 2 * np.pi, 0.1)
    # scale the unit circle by the standard deviations along the principal axes
    p = np.sqrt(dV.reshape(D, 1)) * np.array([np.cos(theta), np.sin(theta)])
    # rotate into data coordinates and shift to the component mean
    x = (E @ p) + np.tile(m[:, i:i + 1], (1, len(theta)))
    plt.plot(x[0], x[1], 'r-', linewidth=2)
plt.title('training data, test data (in black)')
plt.show()
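
# ----------------------------------------------------------------------
# A minimal reference sketch for the parts marked with ?, assuming that
# GMMem.GMMloglik(X, P, m, S) returns the total log-likelihood of the
# data X under the mixture (P, m, S). Check the definition in GMMem.py,
# since the exact name and argument order of that helper may differ.
#
#     folds = np.array_split(order, foldCount)  # permuted indices, split into folds
#
#     # inside the double loop over H and fold:
#     val_indices = folds[fold]                                # held-out fold
#     training_indices = np.hstack(
#         [folds[j] for j in range(foldCount) if j != fold])   # remaining folds
#     ...
#     logl = GMMem.GMMloglik(X_val, P, m, S)                   # validation log-likelihood
#     loglikelihoods[H - 1, fold] = logl
#
#     # after retraining on all of Xtrain with h components:
#     logl = GMMem.GMMloglik(Xtest, P, m, S)                   # test log-likelihood
#     print('Test Data Log-Likelihood = {0:f}'.format(logl))
#
# Splitting the permuted `order` (rather than np.arange(Nlearning)) keeps
# each fold a random subsample, so any structure in the original ordering
# of the training data cannot bias a single fold.
# ----------------------------------------------------------------------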