Adithya Bellary Kourosh Arasteh Arko Bank

In [1]:
#imports
%matplotlib inline

import numpy as np
import scipy.stats as stats
import os
import matplotlib.pyplot as plt

Speaker Dependent Feature Extraction

We assign the 5th utterance of each word from each speaker to the test set; all other utterances are used to train our HMMs.

In [2]:
#speaker dependent partitions
#sdeptrain dimensions sdeptrain[speaker][word][utterance idx] with idxs given alphabetically
sdeptrain = [[[[] for i in range(4)] for j in range(5)] for k in range(4)]
#sdeptest dimensions sdeptest[speaker][word] with idxs given alphabetically
sdeptest = [[[] for i in range(5)] for j in range(4)]
p = './feature'
speakerList = ['dg', 'ls', 'mh', 'yx']
wordList = ['asr','cnn','dnn','hmm','tts']
snum = 0
for s in speakerList:
    speakerpath = p+'/'+s
    wnum = 0
    for w in wordList:
        wordpath = speakerpath +'/'+s+'_'+w
        #set up train
        for utt in range(1,5):
            fpath = wordpath + str(utt)+'.fea'
            f = open(fpath,'r')
            data = f.readlines()
            data = [i.split(',') for i in data]
            for sample in range(len(data)):
                data[sample] = [float(i) for i in data[sample]]
            for d in data:
                sdeptrain[snum][wnum][utt-1].append(d)
        #set up test (5th utterance); parse to floats like the training data
        fpath2 = wordpath + str(5)+'.fea'
        f2 = open(fpath2,'r')
        data = f2.readlines()
        data = [i.split(',') for i in data]
        for sample in range(len(data)):
            data[sample] = [float(i) for i in data[sample]]
        for d in data:
            sdeptest[snum][wnum].append(d)
        wnum += 1
    snum += 1
sdeptest = np.array(sdeptest)
sdeptrain = np.array(sdeptrain)

Creating the Mean and Covariance Matrices

For this lab, we model the emission distribution of each of our 5 states as a Gaussian, so each state is characterized by a mean vector and a covariance matrix. Below, we initialize these parameters from all training utterances of each word; every state starts out with the same mean and covariance. Throughout training, these values are updated and tuned to give each HMM its optimal set of parameters.

We use a diagonal covariance matrix instead of the full covariance matrix. We made this design choice because, when we tried to use the full matrix, the covariance estimates did not stay positive semidefinite after a number of iterations. Keeping only the diagonal entries also greatly improved the numerical stability of the HMM.
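
As an illustration of this design choice (not part of the lab pipeline), here is a minimal sketch of how one can check it, using purely synthetic stand-in data: extract the diagonal of a sample covariance and verify positive semidefiniteness through the eigenvalues.

import numpy as np

X = np.random.randn(200, 14)              # hypothetical stand-in for 14-dim MFCC frames
cov_full = np.cov(X.T)                    # full 14x14 sample covariance
cov_diag = np.diag(np.diag(cov_full))     # keep only the diagonal entries

# a covariance matrix is PSD iff all eigenvalues are >= 0 (up to round-off)
print(np.all(np.linalg.eigvalsh(cov_full) >= -1e-10))
print(np.all(np.linalg.eigvalsh(cov_diag) >= 0))  # diagonal: PSD whenever the variances are nonnegative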

In [3]:
bigasr = []
bigcnn = []
bigdnn = []
bighmm = []
bigtts = []
for i in range(4):
    for j in range(4):
        bigasr.extend(sdeptrain[i][0][j])
        bigcnn.extend(sdeptrain[i][1][j])
        bigdnn.extend(sdeptrain[i][2][j])
        bighmm.extend(sdeptrain[i][3][j])
        bigtts.extend(sdeptrain[i][4][j])
sdepasr_cov = np.diag(np.diag(np.cov(np.array(bigasr).T)))
sdepcnn_cov = np.diag(np.diag(np.cov(np.array(bigcnn).T)))
sdepdnn_cov = np.diag(np.diag(np.cov(np.array(bigdnn).T)))
sdephmm_cov = np.diag(np.diag(np.cov(np.array(bighmm).T)))
sdeptts_cov = np.diag(np.diag(np.cov(np.array(bigtts).T)))
sdepasr_mn = np.mean(bigasr,axis=0)
sdepcnn_mn = np.mean(bigcnn,axis=0)
sdepdnn_mn = np.mean(bigdnn,axis=0)
sdephmm_mn = np.mean(bighmm,axis=0)
sdeptts_mn = np.mean(bigtts,axis=0)

Speaker Independent Feature Extraction

All utterances of speaker yx will be used as test data. The rest are assigned as training data.

In [4]:
#speaker independent partitions
#sindeptrain dimensions sindeptrain[speaker][word][utterance idx] with idxs given alphabetically
sindeptrain = [[[[] for i in range(5)] for j in range(5)] for k in range(3)]
#sindeptest dimensions sindeptest[word][utterance idx] with idxs given alphabetically
sindeptest = [[[] for i in range(5)] for j in range(5)]
p = './feature'
speakerList2 = ['dg', 'ls', 'mh'] #'yx' is held out for testing
wordList = ['asr','cnn','dnn','hmm','tts']
snum = 0
for s in speakerList2:
    speakerpathi = p+'/'+s
    wnum = 0
    for w in wordList:
        wordpathi = speakerpathi +'/'+s+'_'+w
        #set up train
        for utt in range(1,6):
            fpathi = wordpathi + str(utt)+'.fea'
            fi = open(fpathi,'r')
            datai = fi.readlines()
            datai = [i.split(',') for i in datai]
            for sample in range(len(datai)):
                datai[sample] = [float(i) for i in datai[sample]]
            for d in datai:
                sindeptrain[snum][wnum][utt-1].append(d)
        wnum += 1
    snum += 1

#set up test
s2 = 'yx'
speakertestpath = p+'/'+s2
wnum = 0
for w in wordList:
    wordtestpath = speakertestpath +'/'+s2+'_'+w
    for utt in range(1,6):
        ftestpath = wordtestpath + str(utt)+'.fea'
        fitest = open(ftestpath,'r')
        dataitest = fitest.readlines()
        dataitest = [i.split(',') for i in dataitest]
        for sample in range(len(dataitest)):
            dataitest[sample] = [float(i) for i in dataitest[sample]]
        for d in dataitest:
            sindeptest[wnum][utt-1].append(d)
    wnum += 1

sindeptrain = np.array(sindeptrain)
sindeptest = np.array(sindeptest)

We initialize the mean and covariance matrices in the same manner as for the speaker dependent method.

In [5]:
bigasri = []
bigcnni = []
bigdnni = []
bighmmi = []
bigttsi = []
for i in range(3):
    for j in range(5):
        bigasri.extend(sindeptrain[i][0][j])
        bigcnni.extend(sindeptrain[i][1][j])
        bigdnni.extend(sindeptrain[i][2][j])
        bighmmi.extend(sindeptrain[i][3][j])
        bigttsi.extend(sindeptrain[i][4][j])
sdepasri_cov = np.diag(np.diag(np.cov(np.array(bigasri).T)))
sdepcnni_cov = np.diag(np.diag(np.cov(np.array(bigcnni).T)))
sdepdnni_cov = np.diag(np.diag(np.cov(np.array(bigdnni).T)))
sdephmmi_cov = np.diag(np.diag(np.cov(np.array(bighmmi).T)))
sdepttsi_cov = np.diag(np.diag(np.cov(np.array(bigttsi).T)))
sdepasri_mn = np.mean(bigasri,axis=0)
sdepcnni_mn = np.mean(bigcnni,axis=0)
sdepdnni_mn = np.mean(bigdnni,axis=0)
sdephmmi_mn = np.mean(bighmmi,axis=0)
sdepttsi_mn = np.mean(bigttsi,axis=0)

Initial Probability and Transition Probability Distribution

The initial probability vector defines the probability that the process starts in each state. Since we have 5 states per HMM and we want each state to be equally likely to be the first, each state is assigned an initial probability of 0.2.

The transition probabilities define how we move from state to state. Since we are training a left-to-right non-skip HMM, the only possible move out of a state is to the next one: the process advances with probability 0.2 and stays in the same state with probability 0.8. These are the only two options. The transition probability matrix is updated as we move through the training process.

These two matrices, together with the mean and covariance matrices we just computed, are the 4 parameters that define an HMM at a given time. These parameters are often denoted as $\Theta = (\pi, A, \mu, \sigma)$, with $\pi$ being the initial state probability distribution and $A$ being the transition probability matrix.

In [6]:
initialProbabilities = [0.2, 0.2, 0.2, 0.2, 0.2]

initialTransition = [[0.8,0.2,  0,  0,   0],
                     [  0,0.8,0.2,  0,   0],
                     [  0,  0,0.8,0.2,   0],
                     [  0,  0,  0,0.8, 0.2],
                     [  0,  0,  0,  0,   1]]

HMM Training Helper Functions

Now that we have the parameters that define the HMM at a particular time, we need to define how these parameters are updated. We train each HMM according to the Baum-Welch algorithm, following https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm.

The helper functions below generate the $\alpha$, $\beta$, $\gamma$, and $\xi$ values that we need to update our initial probabilities $\pi$ and transition matrix $A$.
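
For reference, these are the textbook forms of these quantities in our notation, where $a_{ij}$ is an entry of $A$, $b_i(x_t)$ is the Gaussian emission likelihood of frame $x_t$ in state $i$, $\alpha_i(1) = \pi_i\, b_i(x_1)$, and $\beta_i(T) = 1$ (the code computes scaled versions of $\alpha$ and $\beta$ for numerical stability):

$$\alpha_i(t) = b_i(x_t)\sum_{j} \alpha_j(t-1)\,a_{ji}, \qquad \beta_i(t) = \sum_{j} a_{ij}\,b_j(x_{t+1})\,\beta_j(t+1)$$

$$\gamma_i(t) = \frac{\alpha_i(t)\,\beta_i(t)}{\sum_j \alpha_j(t)\,\beta_j(t)}, \qquad \xi_{ij}(t) = \frac{\alpha_i(t)\,a_{ij}\,b_j(x_{t+1})\,\beta_j(t+1)}{\sum_{k}\sum_{l} \alpha_k(t)\,a_{kl}\,b_l(x_{t+1})\,\beta_l(t+1)}$$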

In [7]:
##### HELPER FUNCTIONS
#implemented from https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm

#Forward Pass
def calculateBMatrix(X, mu, sigma,N):
    #X is the data in a file (a sequence of T feature frames)
    #mu is the list of per-state mean vectors
    #sigma is the list of per-state covariance matrices
    #N is the number of states
    #bMatrix has shape (N,T): bMatrix[state,frame] is the Gaussian emission
    #likelihood of that frame under that state
    T = len(X)
    bMatrix = np.zeros((N,T))
    for frame in range(T):
        for state in range(N):
            bMatrix[state,frame] = stats.multivariate_normal(mean=mu[state],cov=sigma[state]).pdf(X[frame])
    return bMatrix

def calculateAlphaMatrix(transMatrix, BMatrix,priors):
    #transMatrix is A
    #BMatrix is B
    #priors is pi
    #unscaled forward pass; alpha has shape (N,T)
    T = len(BMatrix[0])
    N = len(BMatrix)
    Amat = np.array(transMatrix)
    alpha = np.zeros((N,T))
    for i in range(N):
        alpha[i,0]=priors[i]*BMatrix[i,0]
    for t in range(1,T):
        for i in range(0,N):
            alpha[i,t]=BMatrix[i,t]*np.inner(alpha[:,t-1],Amat[:,i])
    return alpha

def calculateAlphaBetaTildeG(transMatrix,BMatrix,priors):
    #scaled forward-backward pass: returns the normalized (tilde) alpha and beta
    #matrices along with log_g, the per-frame log scaling factors, so that
    #log P(X | Theta) = np.sum(log_g)
    T = len(BMatrix[0])
    N = len(BMatrix)
    baralpha = np.zeros((N,T))
    tildealpha = np.zeros((N,T))
    tildebeta = np.zeros((N,T))
    log_g = np.zeros((T))
    for i in range(0,N):
        baralpha[i,0]=priors[i]*BMatrix[i,0]
    log_g[0] = np.log(np.sum(baralpha[:,0]))
    tildealpha[:,0]=baralpha[:,0]/np.exp(log_g[0])

    for t in range(1,T):
        for i in range(0,N):
            baralpha[i,t]=BMatrix[i,t]*np.inner(tildealpha[:,t-1],transMatrix[:,i])
        log_g[t] = np.log(np.sum(baralpha[:,t]))
        tildealpha[:,t]=baralpha[:,t]/np.exp(log_g[t])
    for i in range(0,N):
        tildebeta[i,T-1] = 1/np.exp(log_g[T-1])

    for t in range(T-2,-1,-1):
        for i in range(0,N):
            tildebeta[i,t]=np.inner(transMatrix[i,0:N],tildebeta[:,t+1]*BMatrix[:,t+1])/np.exp(log_g[t+1])

    return tildealpha,tildebeta,log_g
#Backward Pass

def calculateBetaMatrix(transMatrix, bMatrix):
    #populate the Beta matrix
    T = len(bMatrix[0])
    N = len(bMatrix)
    beta = np.zeros((N,T))
    for i in range(N):
        beta[i,T-1]=1
    for t in range(T-2,-1,-1):
        for i in range(0,N):
            beta[i,t]=np.inner(transMatrix[i,0:N],beta[:,t+1]*bMatrix[:,t+1])
    return beta
#Update step helper functions

def calculateGammaMatrix(alphaMatrix, betaMatrix):
    T = len(betaMatrix[0])
    N = len(betaMatrix)
    gamma = np.zeros((N,T))
    for t in range(T):
        gamma[:,t]=alphaMatrix[:,t]*betaMatrix[:,t]
        gamma[:,t]=gamma[:,t]/np.sum(gamma[:,t])
    return gamma

def calculateXiMatrix( transMatrix, alphaMatrix, betaMatrix, bMatrix):
    #calculate xi: the probability of being in state i at time t and state j at time t+1
    #for a left-to-right non-skip model only j = i (stay) or j = i+1 (advance) can occur,
    #so xi is stored compactly with row index i+j (2*N rows) instead of a full NxNxT array
    T = len(bMatrix[0])
    N = len(bMatrix)
    xi = np.zeros((2*N,T))
    for t in range(0,T):
        for i in range(0,N):
            for j in range(i,i+2):
                if j>= N:
                    xi[i+j,t] = 0
                else:
                    xi[i+j,t]=alphaMatrix[i,t]*transMatrix[i,j]
                if (t<T-1):
                    if j==N:
                        xi[i+j,t]=0
                    else:
                        xi[i+j,t] = xi[i+j,t]*bMatrix[j,t+1]*betaMatrix[j,t+1]
        xi[:,t]=xi[:,t]/np.sum(xi[:,t])
    return xi

Training the HMMs

We train each HMM until its parameters converge. To find the necessary number of iterations, we tracked the log-likelihood of the training data and stopped once it no longer changed; we found that 25 iterations are sufficient for the model parameters to reach convergence.

The general training procedure is the same for the speaker independent and the speaker dependent version; the difference stems from the train/test split made above. The speaker independent version trains on all 5 utterances from each of 3 speakers, while the speaker dependent version trains on the first 4 utterances from each of the 4 speakers. These differences change the code structure slightly.
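
The convergence plot itself is not shown in this notebook; the sketch below shows how one could produce it from a per-file log-likelihood array like the logprobs accumulated inside the training functions (the array is not returned by those functions, so treat the variable names here as hypothetical):

# hypothetical: logprobs[f, it] = log-likelihood of training file f at iteration it
loglik_per_iter = logprobs.sum(axis=0)   # total log-likelihood at each iteration
plt.plot(loglik_per_iter)
plt.xlabel('Baum-Welch iteration')
plt.ylabel('total log-likelihood')
plt.title('HMM training convergence')
plt.show()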

In [8]:
#SINDEP general
def SINDEPTRAIN(initialTransition,initialProbabilities,initialmeans,initialcov,wordidx,numiterations,numcoeff,Sindeptrain):
    A = np.array(initialTransition)
    mn = initialmeans
    cov = initialcov
    logprobs = np.zeros((15,numiterations))
    for iteration in range(numiterations):
        alphas = []
        betas = []
        gammas = []
        xis = []
        for s in range(3):#for each speaker
            for u in range(5): # for each utterance
                B = calculateBMatrix(Sindeptrain[s][wordidx][u],mn,cov,5)
                Alpha,Beta,logg = calculateAlphaBetaTildeG(A,B,initialProbabilities)
                Gamma = calculateGammaMatrix(Alpha,Beta)
                xi = calculateXiMatrix(A,Alpha,Beta,B)
                alphas.append(Alpha)
                betas.append(Beta)
                gammas.append(Gamma)
                xis.append(xi)
                logprobs[s*5+u,iteration] = np.sum(logg) #per-file log-likelihood this iteration

        #update the parameters (M-step)
        num_files = len(alphas)
        N = len(gammas[0])
        A_new = np.zeros((N,N))
        mu_new = np.zeros((N,numcoeff))
        sigma_new = np.zeros((N,numcoeff, numcoeff))
        # find aij
        denomsum = np.zeros((N))
        for i in range(0,N):

            for f in range(num_files):
                for t in range(len(gammas[f][i])):
                    denomsum[i] += gammas[f][i][t]

            for j in range(i,i+2):
                if j<N:
                    numsum = 0
                    for f in range(num_files):
                        for t in range(len(gammas[f][i])):
                            numsum += xis[f][i+j][t]
                    A_new[i][j] = numsum/denomsum[i]
        #find mu_i: gamma-weighted mean of the training frames
        for i in range(0,N):
            numsum = 0
            for f in range(num_files):
                for t in range(len(Sindeptrain[f//5][wordidx][f%5])):
                    numsum += (np.array(gammas[f][i][t]) * np.array(Sindeptrain[f//5][wordidx][f%5][t]))
            mu_new[i] = numsum/denomsum[i]

        #find sigma_i: gamma-weighted outer products (uses the previous iteration's means mn[i])
        for i in range(0,N):
            numsum = 0
            for f in range(num_files):
                for t in range(len(Sindeptrain[f//5][wordidx][f%5])):
                    numsum += (np.array(gammas[f][i][t]) * np.array(np.outer((Sindeptrain[f//5][wordidx][f%5][t]-mn[i]),(Sindeptrain[f//5][wordidx][f%5][t]-mn[i]))))
            sigma_new[i] = numsum/denomsum[i]
        #save the new parameters
        A = A_new
        mn = mu_new
        cov = sigma_new
    print("indep hmm completed for word",wordidx)
    return A,mn,cov
In [9]:
#SDEP general
def SDEPTRAIN(initialTransition,initialProbabilities,initialmeans,initialcov,wordidx,numiterations,numcoeff,Sdeptrain):
    A = np.array(initialTransition)
    mn = initialmeans
    cov = initialcov
    logprobs = np.zeros((16,numiterations))
    for iteration in range(numiterations):
        # calc params
        alphas = []
        betas = []
        gammas = []
        xis = []
        for s in range(4):#for each speaker
            for u in range(4): # for each utterance
                B = calculateBMatrix(Sdeptrain[s][wordidx][u],mn,cov,5)
                Alpha,Beta,logg = calculateAlphaBetaTildeG(A,B,initialProbabilities)
                Gamma = calculateGammaMatrix(Alpha,Beta)
                xi = calculateXiMatrix(A,Alpha,Beta,B)
                alphas.append(Alpha)
                betas.append(Beta)
                gammas.append(Gamma)
                xis.append(xi)
                logprobs[s*4+u,iteration] = np.sum(logg) #per-file log-likelihood this iteration

        #update the parameters (M-step)
        num_files = len(alphas)
        N = len(gammas[0])
        A_new = np.zeros((N,N))
        mu_new = np.zeros((N,numcoeff))
        sigma_new = np.zeros((N,numcoeff, numcoeff))
        # find aij
        denomsum = np.zeros((N))
        for i in range(0,N):

            for f in range(num_files):
                for t in range(len(gammas[f][i])):
                    denomsum[i] += gammas[f][i][t]

            for j in range(i,i+2):
                if j<N:
                    numsum = 0
                    for f in range(num_files):
                        for t in range(len(gammas[f][i])):
                            numsum += xis[f][i+j][t]
                    A_new[i][j] = numsum/denomsum[i]
        #find mu_i: gamma-weighted mean of the training frames
        for i in range(0,N):

            numsum = 0
            for f in range(num_files):
                for t in range(len(gammas[f][i])):
                    numsum += (np.array(gammas[f][i][t]) * np.array(Sdeptrain[f//4][wordidx][f%4][t]))
            mu_new[i] = numsum/denomsum[i]

        #find sigma_i: gamma-weighted outer products (uses the previous iteration's means mn[i])
        for i in range(0,N):

            numsum = 0
            for f in range(num_files):
                for t in range(len(gammas[f][i])):
                    numsum += (np.array(gammas[f][i][t]) * np.array(np.outer((Sdeptrain[f//4][wordidx][f%4][t]-mn[i]),(Sdeptrain[f//4][wordidx][f%4][t]-mn[i]))))
            sigma_new[i] = numsum/denomsum[i]

        A = A_new
        mn = mu_new
        cov = sigma_new
    print(A_new)
    print("dep hmm completed for word",wordidx)
    return A,mn,cov

Here we obtain the optimal parameters of the speaker dependent HMM for each word by training on the corresponding portion of the training set.

In [10]:
#TRAINING FOR SPEAKER DEPENDENT
asrmean = [sdepasr_mn,sdepasr_mn,sdepasr_mn,sdepasr_mn,sdepasr_mn]
asrcova = [sdepasr_cov,sdepasr_cov,sdepasr_cov,sdepasr_cov,sdepasr_cov]
cnnmean = [sdepcnn_mn,sdepcnn_mn,sdepcnn_mn,sdepcnn_mn,sdepcnn_mn]
cnncova = [sdepcnn_cov,sdepcnn_cov,sdepcnn_cov,sdepcnn_cov,sdepcnn_cov]
dnnmean = [sdepdnn_mn,sdepdnn_mn,sdepdnn_mn,sdepdnn_mn,sdepdnn_mn]
dnncova = [sdepdnn_cov,sdepdnn_cov,sdepdnn_cov,sdepdnn_cov,sdepdnn_cov]
hmmmean = [sdephmm_mn,sdephmm_mn,sdephmm_mn,sdephmm_mn,sdephmm_mn]
hmmcova = [sdephmm_cov,sdephmm_cov,sdephmm_cov,sdephmm_cov,sdephmm_cov]
ttsmean = [sdeptts_mn,sdeptts_mn,sdeptts_mn,sdeptts_mn,sdeptts_mn]
ttscova = [sdeptts_cov,sdeptts_cov,sdeptts_cov,sdeptts_cov,sdeptts_cov]
A_asr,mn_asr,cov_asr = SDEPTRAIN(initialTransition,initialProbabilities,asrmean,asrcova,0,25,14,sdeptrain)
A_cnn,mn_cnn,cov_cnn = SDEPTRAIN(initialTransition,initialProbabilities,cnnmean,cnncova,1,25,14,sdeptrain)
A_dnn,mn_dnn,cov_dnn = SDEPTRAIN(initialTransition,initialProbabilities,dnnmean,dnncova,2,25,14,sdeptrain)
A_hmm,mn_hmm,cov_hmm = SDEPTRAIN(initialTransition,initialProbabilities,hmmmean,hmmcova,3,25,14,sdeptrain)
A_tts,mn_tts,cov_tts = SDEPTRAIN(initialTransition,initialProbabilities,ttsmean,ttscova,4,25,14,sdeptrain)
[[ 0.88004837  0.11995163  0.          0.          0.        ]
 [ 0.          0.91842459  0.08157541  0.          0.        ]
 [ 0.          0.          0.89646572  0.10353428  0.        ]
 [ 0.          0.          0.          0.92693646  0.07306354]
 [ 0.          0.          0.          0.          1.        ]]
dep hmm completed for word 0
[[ 0.93941349  0.06058651  0.          0.          0.        ]
 [ 0.          0.93207714  0.06792286  0.          0.        ]
 [ 0.          0.          0.85274211  0.14725789  0.        ]
 [ 0.          0.          0.          0.94678115  0.05321885]
 [ 0.          0.          0.          0.          1.        ]]
dep hmm completed for word 1
[[ 0.85035388  0.14964612  0.          0.          0.        ]
 [ 0.          0.94331581  0.05668419  0.          0.        ]
 [ 0.          0.          0.89479096  0.10520904  0.        ]
 [ 0.          0.          0.          0.96392057  0.03607943]
 [ 0.          0.          0.          0.          1.        ]]
dep hmm completed for word 2
[[ 0.82018617  0.17981383  0.          0.          0.        ]
 [ 0.          0.94225915  0.05774085  0.          0.        ]
 [ 0.          0.          0.77478542  0.22521458  0.        ]
 [ 0.          0.          0.          0.84477418  0.15522582]
 [ 0.          0.          0.          0.          1.        ]]
dep hmm completed for word 3
[[ 0.90084898  0.09915102  0.          0.          0.        ]
 [ 0.          0.92296079  0.07703921  0.          0.        ]
 [ 0.          0.          0.91058641  0.08941359  0.        ]
 [ 0.          0.          0.          0.96567364  0.03432636]
 [ 0.          0.          0.          0.          1.        ]]
dep hmm completed for word 4

Testing the HMM

We score each test utterance by making a forward pass to obtain the scaled $\alpha$ and $\beta$ values. The log-likelihood of the utterance under an HMM is then the sum of the elements of the $\log g$ vector, and we classify the utterance as the word whose HMM gives the highest log-likelihood.
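
Why summing $\log g$ gives the log-likelihood: the scaled forward pass normalizes $\bar{\alpha}$ at every frame by $g_t = \sum_i \bar{\alpha}_i(t)$, so the likelihood of the whole utterance factors into these scaling constants:

$$P(X \mid \Theta) = \prod_{t=1}^{T} g_t \qquad\Longrightarrow\qquad \log P(X \mid \Theta) = \sum_{t=1}^{T} \log g_t$$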

In [11]:
def testlogProb(testfile,Atest,meanstest,covtest,initProbs):
    At = np.array(Atest)
    Btest = calculateBMatrix(testfile,meanstest,covtest,5)
    logg = np.array(calculateAlphaBetaTildeG(At,Btest,initProbs)[2])
    return np.sum(logg)
In [12]:
#SPEAKER DEPENDENT TEST
#sdeptest[speaker][word]
classifications = np.zeros((5,5))
#classifications[truth][prediction] where idx{0,1,2,3,4} => {'asr','cnn','dnn','hmm','tts'}
for i in range(4):
    for j in range(5):
        file = sdeptest[i][j]
        preds = [testlogProb(file,A_asr,mn_asr,cov_asr,initialProbabilities),
                 testlogProb(file,A_cnn,mn_cnn,cov_cnn,initialProbabilities),
                 testlogProb(file,A_dnn,mn_dnn,cov_dnn,initialProbabilities),
                 testlogProb(file,A_hmm,mn_hmm,cov_hmm,initialProbabilities),
                 testlogProb(file,A_tts,mn_tts,cov_tts,initialProbabilities)]
        classifications[j][np.argmax(preds)] += 1

print('Confusion Matrix\n',classifications/4)
print('Accuracy\n',np.mean(np.diag(classifications/4)))

c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: overflow encountered in double_scalars
Confusion Matrix
 [[ 1.    0.    0.    0.    0.  ]
 [ 0.    1.    0.    0.    0.  ]
 [ 0.    0.25  0.75  0.    0.  ]
 [ 0.    0.    0.    1.    0.  ]
 [ 0.    0.    0.    0.    1.  ]]
Accuracy
 0.95
In [13]:
#SPEAKER INDEPENDENT TRAIN
asrmeani = [sdepasri_mn,sdepasri_mn,sdepasri_mn,sdepasri_mn,sdepasri_mn]
asrcovai = [sdepasri_cov,sdepasri_cov,sdepasri_cov,sdepasri_cov,sdepasri_cov]
cnnmeani = [sdepcnni_mn,sdepcnni_mn,sdepcnni_mn,sdepcnni_mn,sdepcnni_mn]
cnncovai = [sdepcnni_cov,sdepcnni_cov,sdepcnni_cov,sdepcnni_cov,sdepcnni_cov]
dnnmeani = [sdepdnni_mn,sdepdnni_mn,sdepdnni_mn,sdepdnni_mn,sdepdnni_mn]
dnncovai = [sdepdnni_cov,sdepdnni_cov,sdepdnni_cov,sdepdnni_cov,sdepdnni_cov]
hmmmeani = [sdephmmi_mn,sdephmmi_mn,sdephmmi_mn,sdephmmi_mn,sdephmmi_mn]
hmmcovai = [sdephmmi_cov,sdephmmi_cov,sdephmmi_cov,sdephmmi_cov,sdephmmi_cov]
ttsmeani = [sdepttsi_mn,sdepttsi_mn,sdepttsi_mn,sdepttsi_mn,sdepttsi_mn]
ttscovai = [sdepttsi_cov,sdepttsi_cov,sdepttsi_cov,sdepttsi_cov,sdepttsi_cov]
A_asri,mn_asri,cov_asri = SINDEPTRAIN(initialTransition,initialProbabilities,asrmeani,asrcovai,0,25,14, sindeptrain)
A_cnni,mn_cnni,cov_cnni = SINDEPTRAIN(initialTransition,initialProbabilities,cnnmeani,cnncovai,1,25,14, sindeptrain)
A_dnni,mn_dnni,cov_dnni = SINDEPTRAIN(initialTransition,initialProbabilities,dnnmeani,dnncovai,2,25,14, sindeptrain)
A_hmmi,mn_hmmi,cov_hmmi = SINDEPTRAIN(initialTransition,initialProbabilities,hmmmeani,hmmcovai,3,25,14, sindeptrain)
A_ttsi,mn_ttsi,cov_ttsi = SINDEPTRAIN(initialTransition,initialProbabilities,ttsmeani,ttscovai,4,25,14, sindeptrain)
indep hmm completed for word 0
indep hmm completed for word 1
indep hmm completed for word 2
indep hmm completed for word 3
indep hmm completed for word 4
In [14]:
#SPEAKER INDEPENDENT TEST
#test[word][utt]
classificationsi = np.zeros((5,5))
#classifications[truth][prediction] where idx{0,1,2,3,4} => {'asr','cnn','dnn','hmm','tts'}
for i in range(5):
    for j in range(5):
        filei = sindeptest[i][j]
        predsi = [testlogProb(filei,A_asri,mn_asri,cov_asri,initialProbabilities),
                 testlogProb(filei,A_cnni,mn_cnni,cov_cnni,initialProbabilities),
                 testlogProb(filei,A_dnni,mn_dnni,cov_dnni,initialProbabilities),
                 testlogProb(filei,A_hmmi,mn_hmmi,cov_hmmi,initialProbabilities),
                 testlogProb(filei,A_ttsi,mn_ttsi,cov_ttsi,initialProbabilities)]
        classificationsi[i][np.argmax(predsi)] += 1

print('Confusion Matrix\n',classificationsi/5)
print('Accuracy\n',np.mean(np.diag(classificationsi/5)))

c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: overflow encountered in double_scalars
Confusion Matrix
 [[ 1.   0.   0.   0.   0. ]
 [ 0.6  0.2  0.2  0.   0. ]
 [ 0.2  0.   0.8  0.   0. ]
 [ 0.   1.   0.   0.   0. ]
 [ 0.2  0.   0.   0.   0.8]]
Accuracy
 0.56

Testing on Recorded Voice

Feature Extraction for Python-Generated MFCC Features

In [15]:
################ FEATURE EXTRACTION FOR PYTHON GENERATED MFCC FEATURES ###########
#speaker independent partitions
#py_sindeptrain dimensions py_sindeptrain[speaker][word][utterance idx] with idxs given alphabetically
py_sindeptrain = [[[[] for i in range(5)] for j in range(5)] for k in range(3)]
#py_sindeptest dimensions py_sindeptest[word][utterance idx] with idxs given alphabetically
py_sindeptest = [[[] for i in range(5)] for j in range(5)]
py_p = './feature'
py_speakerList2 = ['dg', 'ls', 'mh'] #'yx' is held out for testing
py_wordList = ['asr','cnn','dnn','hmm','tts']
py_snum = 0
for s in py_speakerList2:
    py_speakerpathi = py_p+'/py_'+s
    py_wnum = 0
    for w in py_wordList:
        py_wordpathi = py_speakerpathi +'/'+s+'_'+w
        #set up train
        for utt in range(1,6):
            py_fpathi = py_wordpathi + str(utt)+'.fea'
            py_fi = open(py_fpathi,'r')
            py_datai = py_fi.readlines()
            py_datai = [i.split(',') for i in py_datai]
            for sample in range(len(py_datai)):
                py_datai[sample] = [float(i) for i in py_datai[sample]]
                #normalizing training data
                py_datai[sample] = py_datai[sample]/np.linalg.norm(py_datai[sample])
            for d in py_datai:
                py_sindeptrain[py_snum][py_wnum][utt-1].append(d)
        py_wnum += 1
    py_snum += 1

#set up test
py_s2 = 'yx'
py_speakertestpath = py_p+'/py_'+py_s2
py_wnum = 0
for w in py_wordList:
    py_wordtestpath = py_speakertestpath +'/'+py_s2+'_'+w
    for utt in range(1,6):
        py_ftestpath = py_wordtestpath + str(utt)+'.fea'
        py_fitest = open(py_ftestpath,'r')
        py_dataitest = py_fitest.readlines()
        py_dataitest = [i.split(',') for i in py_dataitest]
        for sample in range(len(py_dataitest)):
            py_dataitest[sample] = [float(i) for i in py_dataitest[sample]]
            py_dataitest[sample] = py_dataitest[sample]/np.linalg.norm(py_dataitest[sample])
        for d in py_dataitest:
            py_sindeptest[py_wnum][utt-1].append(d)
    py_wnum += 1

py_sindeptrain = np.array(py_sindeptrain)
py_sindeptest = np.array(py_sindeptest)

py_bigasri = []
py_bigcnni = []
py_bigdnni = []
py_bighmmi = []
py_bigttsi = []
for i in range(3):
    for j in range(5):
        py_bigasri.extend(py_sindeptrain[i][0][j])
        py_bigcnni.extend(py_sindeptrain[i][1][j])
        py_bigdnni.extend(py_sindeptrain[i][2][j])
        py_bighmmi.extend(py_sindeptrain[i][3][j])
        py_bigttsi.extend(py_sindeptrain[i][4][j])
py_sdepasri_cov = np.diag(np.diag(np.cov(np.array(py_bigasri).T)))
py_sdepcnni_cov = np.diag(np.diag(np.cov(np.array(py_bigcnni).T)))
py_sdepdnni_cov = np.diag(np.diag(np.cov(np.array(py_bigdnni).T)))
py_sdephmmi_cov = np.diag(np.diag(np.cov(np.array(py_bighmmi).T)))
py_sdepttsi_cov = np.diag(np.diag(np.cov(np.array(py_bigttsi).T)))
py_sdepasri_mn = np.mean(py_bigasri,axis=0)
py_sdepcnni_mn = np.mean(py_bigcnni,axis=0)
py_sdepdnni_mn = np.mean(py_bigdnni,axis=0)
py_sdephmmi_mn = np.mean(py_bighmmi,axis=0)
py_sdepttsi_mn = np.mean(py_bigttsi,axis=0)
# print(len(py_sindeptrain[0][0][0][0]))
# print(py_sdepasri_cov)

Training the Speaker Independent HMMs on Python-Generated MFCC Data

In [16]:
#SPEAKER INDEPENDENT TRAIN
py_asrmeani = [py_sdepasri_mn,py_sdepasri_mn,py_sdepasri_mn,py_sdepasri_mn,py_sdepasri_mn]
py_asrcovai = [py_sdepasri_cov,py_sdepasri_cov,py_sdepasri_cov,py_sdepasri_cov,py_sdepasri_cov]
py_cnnmeani = [py_sdepcnni_mn,py_sdepcnni_mn,py_sdepcnni_mn,py_sdepcnni_mn,py_sdepcnni_mn]
py_cnncovai = [py_sdepcnni_cov,py_sdepcnni_cov,py_sdepcnni_cov,py_sdepcnni_cov,py_sdepcnni_cov]
py_dnnmeani = [py_sdepdnni_mn,py_sdepdnni_mn,py_sdepdnni_mn,py_sdepdnni_mn,py_sdepdnni_mn]
py_dnncovai = [py_sdepdnni_cov,py_sdepdnni_cov,py_sdepdnni_cov,py_sdepdnni_cov,py_sdepdnni_cov]
py_hmmmeani = [py_sdephmmi_mn,py_sdephmmi_mn,py_sdephmmi_mn,py_sdephmmi_mn,py_sdephmmi_mn]
py_hmmcovai = [py_sdephmmi_cov,py_sdephmmi_cov,py_sdephmmi_cov,py_sdephmmi_cov,py_sdephmmi_cov]
py_ttsmeani = [py_sdepttsi_mn,py_sdepttsi_mn,py_sdepttsi_mn,py_sdepttsi_mn,py_sdepttsi_mn]
py_ttscovai = [py_sdepttsi_cov,py_sdepttsi_cov,py_sdepttsi_cov,py_sdepttsi_cov,py_sdepttsi_cov]
py_A_asri,py_mn_asri,py_cov_asri = SINDEPTRAIN(initialTransition,initialProbabilities,py_asrmeani,py_asrcovai,0,25,14,py_sindeptrain)
py_A_cnni,py_mn_cnni,py_cov_cnni = SINDEPTRAIN(initialTransition,initialProbabilities,py_cnnmeani,py_cnncovai,1,25,14,py_sindeptrain)
py_A_dnni,py_mn_dnni,py_cov_dnni = SINDEPTRAIN(initialTransition,initialProbabilities,py_dnnmeani,py_dnncovai,2,25,14,py_sindeptrain)
py_A_hmmi,py_mn_hmmi,py_cov_hmmi = SINDEPTRAIN(initialTransition,initialProbabilities,py_hmmmeani,py_hmmcovai,3,25,14,py_sindeptrain)
py_A_ttsi,py_mn_ttsi,py_cov_ttsi = SINDEPTRAIN(initialTransition,initialProbabilities,py_ttsmeani,py_ttscovai,4,25,14,py_sindeptrain)
indep hmm completed for word 0
indep hmm completed for word 1
indep hmm completed for word 2
indep hmm completed for word 3
indep hmm completed for word 4
In [17]:
#SPEAKER INDEPENDENT TEST
#test[word][utt]
py_classificationsi = np.zeros((5,5))
#classifications[truth][prediction] where idx{0,1,2,3,4} => {'asr','cnn','dnn','hmm','tts'}
for i in range(5):
    for j in range(5):
        py_filei = py_sindeptest[i][j]
        py_predsi = [testlogProb(py_filei,py_A_asri,py_mn_asri,py_cov_asri,initialProbabilities),
                 testlogProb(py_filei,py_A_cnni,py_mn_cnni,py_cov_cnni,initialProbabilities),
                 testlogProb(py_filei,py_A_dnni,py_mn_dnni,py_cov_dnni,initialProbabilities),
                 testlogProb(py_filei,py_A_hmmi,py_mn_hmmi,py_cov_hmmi,initialProbabilities),
                 testlogProb(py_filei,py_A_ttsi,py_mn_ttsi,py_cov_ttsi,initialProbabilities)]
        py_classificationsi[i][np.argmax(py_predsi)] += 1
    print(py_predsi)

print(py_classificationsi/5)
print(np.mean(np.diag(py_classificationsi/5)))
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: overflow encountered in double_scalars
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: invalid value encountered in multiply
[-23187.940744276118, -23264.339042853164, -20325.786658585221, -15604.562403920221, -18667.387289781167]
[-28575.038009401538, -28281.825071188057, -25638.959716546709, -19225.415901204269, -22814.788792711639]
[-27312.365223374931, -26502.464761314477, -23994.241010773985, -17959.646806842913, -21755.114404550881]
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:52: RuntimeWarning: divide by zero encountered in log
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:53: RuntimeWarning: invalid value encountered in true_divide
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:55: RuntimeWarning: divide by zero encountered in double_scalars
[-26648.102430315932, -26103.16494999432, -22819.513087620453, -17191.653344745013, -20792.990596113719]
[-38224.962738793984, -36901.22909767172, -31298.69141470499, -23828.226556382848, -30217.026489409163]
[[ 0.   0.   0.   1.   0. ]
 [ 0.   0.   0.   1.   0. ]
 [ 0.   0.   0.   1.   0. ]
 [ 0.4  0.   0.   0.6  0. ]
 [ 0.2  0.   0.   0.8  0. ]]
0.12
In [18]:
#access voicetest[word][utt]
voicetest = [[[] for i in range(5)] for j in range(5)]
p = './feature/self_recf'
wordList = ['asr','cnn','dnn','hmm','tts']

speakertestpath = p
wnum = 0
for w in wordList:
    wordtestpath = speakertestpath +'/'+w
    for utt in range(1,6):
        ftestpath = wordtestpath + str(utt)+'.fea'
        fitest = open(ftestpath,'r')
        dataitest = fitest.readlines()
        dataitest = [i.split(',') for i in dataitest]
        for sample in range(len(dataitest)):
            dataitest[sample] = [float(i) for i in dataitest[sample]]
            dataitest[sample] = dataitest[sample]/np.linalg.norm(dataitest[sample])
#             print(dataitest)

        for d in dataitest:
            voicetest[wnum][utt-1].append(d)
    wnum += 1

voicetest = np.array(voicetest)
#SPEAKER INDEPENDENT TEST
#test[word][utt]
py_classificationsvoice = np.zeros((5,5))
#classifications[truth][prediction] where idx{0,1,2,3,4} => {'asr','cnn','dnn','hmm','tts'}
for i in range(5):
    for j in range(5):
        filei = voicetest[i][j]
        py_predsi = [testlogProb(filei,py_A_asri,py_mn_asri,py_cov_asri,initialProbabilities),
                 testlogProb(filei,py_A_cnni,py_mn_cnni,py_cov_cnni,initialProbabilities),
                 testlogProb(filei,py_A_dnni,py_mn_dnni,py_cov_dnni,initialProbabilities),
                 testlogProb(filei,py_A_hmmi,py_mn_hmmi,py_cov_hmmi,initialProbabilities),
                 testlogProb(filei,py_A_ttsi,py_mn_ttsi,py_cov_ttsi,initialProbabilities)]
        py_classificationsvoice[i][np.argmax(py_predsi)] += 1
#         print(predsi)

print('Confusion Matrix\n',py_classificationsvoice/5)
print('Accuracy\n',np.mean(np.diag(py_classificationsvoice/5)))
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:52: RuntimeWarning: divide by zero encountered in log
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:53: RuntimeWarning: invalid value encountered in true_divide
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:46: RuntimeWarning: divide by zero encountered in log
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:47: RuntimeWarning: invalid value encountered in true_divide
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:55: RuntimeWarning: divide by zero encountered in double_scalars
c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: invalid value encountered in multiply
Confusion Matrix
 [[ 0.   0.2  0.6  0.   0.2]
 [ 0.2  0.   0.4  0.   0.4]
 [ 0.2  0.2  0.6  0.   0. ]
 [ 0.   0.6  0.2  0.   0.2]
 [ 0.   0.6  0.4  0.   0. ]]
Accuracy
 0.12

Results

Comparing the speaker independent and speaker dependent results, we found that the speaker dependent system achieved higher accuracy. This makes sense: in the speaker dependent setup, each HMM was trained on utterances from all of the speakers. Each speaker has their own personal sound and way of pronouncing certain words, and accents can further change how a word sounds. It is therefore a good idea to train each HMM on every speaker, so that the resulting model parameters represent a variety of voices. Speaker dependent training produces more robust HMMs.

Upon further inspection, it also makes sense why the speaker independent HMMs performed worse than the speaker dependent HMMs.

One reason the accuracy on our own voice recordings is somewhat low is that we did not have access to the Matlab mfcc function, so we used a Python library to generate the MFCC coefficients. When we tried to recreate the Matlab MFCC values with the Python library, we were unable to; the library must compute the features differently, and it may not capture the same information from the wav files we pass into it. To stay consistent, we trained the speaker independent HMM parameters on MFCC values from the Python library, running it on the wav files given to us and using those values instead of the provided MFCC features. With access to the Matlab mfcc function, we would be able to recreate the right values. Another reason the accuracy for this particular test is low is that training an HMM independently of the speaker is inherently harder: as mentioned before, everyone has their own way of saying the same words, and their intonation and the time they spend on each syllable differ greatly from person to person. If we want our HMMs to perform as well as possible, we should train them on as many voices as we can; the speaker matters a great deal.
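
The notebook does not show how the Python MFCC features were generated. As a purely hypothetical sketch, one common choice is the python_speech_features package, with numcep=14 to match the 14 coefficients used above (the filenames below are placeholders):

import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc

rate, signal = wav.read('example_utterance.wav')  # placeholder filename
feat = mfcc(signal, samplerate=rate, numcep=14)   # (num_frames, 14) MFCC matrix
np.savetxt('example_utterance.fea', feat, delimiter=',')  # same comma-separated .fea layout parsed above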

Feature Extraction for Extra Credit

For the extra credit we opted to use the Mel Frequency Spectral Coefficients (MFSC). Since the MFCCs are obtained by taking the IDFT of the MFSCs, we recover the MFSCs by taking the FFT of the MFCC features given to us.
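
In other words, with $m$ the vector of MFSCs and $c$ the vector of MFCCs for one frame (we keep only the real part of the transform in the code below):

$$c = \mathrm{IDFT}(m) \quad\Longrightarrow\quad m = \mathrm{DFT}(c)$$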

In [19]:
#speaker dependent partitions
#ec_sdeptrain dimensions ec_sdeptrain[speaker][word][utterance idx] with idxs given alphabetically
ec_sdeptrain = [[[[] for i in range(4)] for j in range(5)] for k in range(4)]
#ec_sdeptest dimensions ec_sdeptest[speaker][word] with idxs given alphabetically
ec_sdeptest = [[[] for i in range(5)] for j in range(4)]
p = './feature'
speakerListec = ['dg', 'ls', 'mh', 'yx']
wordListec = ['asr','cnn','dnn','hmm','tts']
snum = 0
for s in speakerListec:
    speakerpathec = p+'/'+s
    wnum = 0
    for w in wordListec:
        wordpathec = speakerpathec +'/'+s+'_'+w
        #set up train
        for utt in range(1,5):
            fpathec = wordpathec + str(utt)+'.fea'
            fec = open(fpathec,'r')
            data = fec.readlines()
            data = [i.split(',') for i in data]
            for sample in range(len(data)):
                data[sample] = [float(i) for i in data[sample]]
                #normalize
                data[sample] = data[sample]/np.linalg.norm(data[sample])

            for d in data:
                ec_sdeptrain[snum][wnum][utt-1].append(np.real(np.fft.fft(d)))
        #set up test
        fpath2ec = wordpathec + str(5)+'.fea'
        f2ec = open(fpath2ec,'r')
        dataec = f2ec.readlines()
        dataec = [i.split(',') for i in dataec]
        for d in dataec:
            ec_sdeptest[snum][wnum].append(np.real(np.fft.fft(np.array(d).astype(float))))
        wnum += 1
    snum += 1
ec_sdeptrain = np.array(ec_sdeptrain)
ec_sdeptest = np.array(ec_sdeptest)


ec_bigasr = []
ec_bigcnn = []
ec_bigdnn = []
ec_bighmm = []
ec_bigtts = []
for i in range(4):
    for j in range(4):
        ec_bigasr.extend(ec_sdeptrain[i][0][j])
        ec_bigcnn.extend(ec_sdeptrain[i][1][j])
        ec_bigdnn.extend(ec_sdeptrain[i][2][j])
        ec_bighmm.extend(ec_sdeptrain[i][3][j])
        ec_bigtts.extend(ec_sdeptrain[i][4][j])
# ec_sdepasr_cov = (np.cov(np.array(ec_bigasr).T)+0.3*np.identity(14))
# ec_sdepcnn_cov = (np.cov(np.array(ec_bigcnn).T)+0.3*np.identity(14))
# ec_sdepdnn_cov = (np.cov(np.array(ec_bigdnn).T)+0.3*np.identity(14))
# ec_sdephmm_cov = (np.cov(np.array(ec_bighmm).T)+0.3*np.identity(14))
# ec_sdeptts_cov = (np.cov(np.array(ec_bigtts).T)+0.3*np.identity(14))

ec_sdepasr_cov = np.diag(np.diag(np.cov(np.array(ec_bigasr).T)))
ec_sdepcnn_cov = np.diag(np.diag(np.cov(np.array(ec_bigcnn).T)))
ec_sdepdnn_cov = np.diag(np.diag(np.cov(np.array(ec_bigdnn).T)))
ec_sdephmm_cov = np.diag(np.diag(np.cov(np.array(ec_bighmm).T)))
ec_sdeptts_cov = np.diag(np.diag(np.cov(np.array(ec_bigtts).T)))


ec_sdepasr_mn = np.mean(ec_bigasr,axis=0)
ec_sdepcnn_mn = np.mean(ec_bigcnn,axis=0)
ec_sdepdnn_mn = np.mean(ec_bigdnn,axis=0)
ec_sdephmm_mn = np.mean(ec_bighmm,axis=0)
ec_sdeptts_mn = np.mean(ec_bigtts,axis=0)
# print(ec_sdepasr_cov)
In [20]:
#TRAINING FOR SPEAKER DEPENDENT FOR EXTRA CREDIT
ec_asrmean = [ec_sdepasr_mn,ec_sdepasr_mn,ec_sdepasr_mn,ec_sdepasr_mn,ec_sdepasr_mn]
ec_asrcova = [ec_sdepasr_cov,ec_sdepasr_cov,ec_sdepasr_cov,ec_sdepasr_cov,ec_sdepasr_cov]
ec_cnnmean = [ec_sdepcnn_mn,ec_sdepcnn_mn,ec_sdepcnn_mn,ec_sdepcnn_mn,ec_sdepcnn_mn]
ec_cnncova = [ec_sdepcnn_cov,ec_sdepcnn_cov,ec_sdepcnn_cov,ec_sdepcnn_cov,ec_sdepcnn_cov]
ec_dnnmean = [ec_sdepdnn_mn,ec_sdepdnn_mn,ec_sdepdnn_mn,ec_sdepdnn_mn,ec_sdepdnn_mn]
ec_dnncova = [ec_sdepdnn_cov,ec_sdepdnn_cov,ec_sdepdnn_cov,ec_sdepdnn_cov,ec_sdepdnn_cov]
ec_hmmmean = [ec_sdephmm_mn,ec_sdephmm_mn,ec_sdephmm_mn,ec_sdephmm_mn,ec_sdephmm_mn]
ec_hmmcova = [ec_sdephmm_cov,ec_sdephmm_cov,ec_sdephmm_cov,ec_sdephmm_cov,ec_sdephmm_cov]
ec_ttsmean = [ec_sdeptts_mn,ec_sdeptts_mn,ec_sdeptts_mn,ec_sdeptts_mn,ec_sdeptts_mn]
ec_ttscova = [ec_sdeptts_cov,ec_sdeptts_cov,ec_sdeptts_cov,ec_sdeptts_cov,ec_sdeptts_cov]
# print(ec_asrcova)
ec_A_asr,ec_mn_asr,ec_cov_asr = SDEPTRAIN(initialTransition,initialProbabilities,ec_asrmean,ec_asrcova,0,25,14,ec_sdeptrain)
print('did 1')
ec_A_cnn,ec_mn_cnn,ec_cov_cnn = SDEPTRAIN(initialTransition,initialProbabilities,ec_cnnmean,ec_cnncova,1,25,14,ec_sdeptrain)
print('did 1')
ec_A_dnn,ec_mn_dnn,ec_cov_dnn = SDEPTRAIN(initialTransition,initialProbabilities,ec_dnnmean,ec_dnncova,2,25,14,ec_sdeptrain)
print('did 1')
ec_A_hmm,ec_mn_hmm,ec_cov_hmm = SDEPTRAIN(initialTransition,initialProbabilities,ec_hmmmean,ec_hmmcova,3,25,14,ec_sdeptrain)
print('did 1')
ec_A_tts,ec_mn_tts,ec_cov_tts = SDEPTRAIN(initialTransition,initialProbabilities,ec_ttsmean,ec_ttscova,4,25,14,ec_sdeptrain)
print('did 1')
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
<ipython-input-20-3fddd0c158ad> in <module>()
     11 ec_ttscova = [ec_sdeptts_cov,ec_sdeptts_cov,ec_sdeptts_cov,ec_sdeptts_cov,ec_sdeptts_cov]
     12 # print(ec_asrcova)
---> 13 ec_A_asr,ec_mn_asr,ec_cov_asr = SDEPTRAIN(initialTransition,initialProbabilities,ec_asrmean,ec_asrcova,0,25,14,ec_sdeptrain)
     14 print('did 1')
     15 ec_A_cnn,ec_mn_cnn,ec_cov_cnn = SDEPTRAIN(initialTransition,initialProbabilities,ec_cnnmean,ec_cnncova,1,25,14,ec_sdeptrain)

<ipython-input-9-4e1c13b0be7a> in SDEPTRAIN(initialTransition, initialProbabilities, initialmeans, initialcov, wordidx, numiterations, numcoeff, Sdeptrain)
     13         for s in range(4):#for each speaker
     14             for u in range(4): # for each utterance
---> 15                 B = calculateBMatrix(Sdeptrain[s][wordidx][u],mn,cov,5)
     16                 Alpha,Beta,logg = calculateAlphaBetaTildeG(A,B,initialProbabilities)
     17                 Gamma = calculateGammaMatrix(Alpha,Beta)

<ipython-input-7-afccbb000942> in calculateBMatrix(X, mu, sigma, N)
     15         temp_frame = []
     16         for state in range(N):
---> 17             bMatrix[state,frame] = stats.multivariate_normal(mean=(mu[state]),cov=sigma[state]).pdf(X[frame])
     18     return bMatrix
     19

c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\scipy\stats\_multivariate.py in __call__(self, mean, cov, allow_singular, seed)
    355         return multivariate_normal_frozen(mean, cov,
    356                                           allow_singular=allow_singular,
--> 357                                           seed=seed)
    358
    359     def _process_parameters(self, dim, mean, cov):

c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\scipy\stats\_multivariate.py in __init__(self, mean, cov, allow_singular, seed, maxpts, abseps, releps)
    725         self.dim, self.mean, self.cov = self._dist._process_parameters(
    726                                                             None, mean, cov)
--> 727         self.cov_info = _PSD(self.cov, allow_singular=allow_singular)
    728         if not maxpts:
    729             maxpts = 1000000 * self.dim

c:\users\adithya\appdata\local\programs\python\python36\lib\site-packages\scipy\stats\_multivariate.py in __init__(self, M, cond, rcond, lower, check_finite, allow_singular)
    157         d = s[s > eps]
    158         if len(d) < len(s) and not allow_singular:
--> 159             raise np.linalg.LinAlgError('singular matrix')
    160         s_pinv = _pinv_1d(s, eps)
    161         U = np.multiply(u, np.sqrt(s_pinv))

LinAlgError: singular matrix
In [ ]:
#SPEAKER DEPENDENT TEST FOR EXTRA CREDIT
ec_classifications = np.zeros((5,5))
#classifications[truth][prediction] where idx{0,1,2,3,4} => {'asr','cnn','dnn','hmm','tts'}
for i in range(4):
    for j in range(5):
        ec_file = ec_sdeptest[i][j]
        ec_preds = [testlogProb(ec_file,ec_A_asr,ec_mn_asr,ec_cov_asr,initialProbabilities),
                 testlogProb(ec_file,ec_A_cnn,ec_mn_cnn,ec_cov_cnn,initialProbabilities),
                 testlogProb(ec_file,ec_A_dnn,ec_mn_dnn,ec_cov_dnn,initialProbabilities),
                 testlogProb(ec_file,ec_A_hmm,ec_mn_hmm,ec_cov_hmm,initialProbabilities),
                 testlogProb(ec_file,ec_A_tts,ec_mn_tts,ec_cov_tts,initialProbabilities)]
        ec_classifications[j][np.argmax(ec_preds)] += 1

print('Confusion Matrix\n',ec_classifications/4)
print('Accuracy\n',np.mean(np.diag(ec_classifications/4)))