In [1]:
import numpy as np

# General Parameters

In [2]:
# Toy training corpus: five sentences wrapped in <s> ... </s> boundary markers.
train = ['<s> a girl likes eating by herself </s>',
         '<s> a cat likes eating meat </s>',
         '<s> the cat likes eating fish </s>',
         '<s> the girl herself </s>',
         '<s> the cat likes eating by itself </s>']

# Word -> integer index. The index picks the word's row in the embedding
# matrices and its position in a predicted probability distribution.
vocab_to_ind = {'<s>': 0, 'a': 1, 'girl': 2, 'likes': 3, 'eating': 4, 'by': 5,
                'herself': 6, '</s>': 7, 'cat': 8, 'meat': 9, 'the': 10, 'fish': 11, 'itself': 12}

# Indices must be exactly 0..V-1, otherwise one-hot rows and matrix rows would not line up.
assert sorted(vocab_to_ind.values()) == list(range(len(vocab_to_ind)))

# Inverse mapping, derived from vocab_to_ind so the two dictionaries cannot drift apart.
ind_to_word = {ind: word for word, ind in vocab_to_ind.items()}

# Derived rather than hard-coded so that editing the vocabulary keeps everything in sync.
vocab_size = len(vocab_to_ind)

# Row i of the identity matrix is the one-hot vector of the word with index i.
one_hot_matrix = np.identity(vocab_size)
print(one_hot_matrix)

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


# FFNN PARAMETERS

In [3]:
# ffnn_E is the 13x2 word-embedding matrix of the feed-forward LM: one row per vocabulary word.
# An embedding matrix is meant to map similar words closer together.
# Row ids follow ind_to_word / vocab_to_ind, e.g.
#   - the first row is the embedding for '<s>'
#   - the third row represents 'girl'
ffnn_E = np.array([[-0.7488, -0.7405],
                   [ 0.1707, -2.9365],
                   [ 0.0066,  0.8496],
                   [-1.0986, -0.7243],
                   [ 0.5666, -0.4858],
                   [ 0.7944,  0.2611],
                   [ 0.6391,  0.3745],
                   [ 0.0274, -0.2199],
                   [ 0.6098,  3.7667],
                   [ 0.1053, -0.4606],
                   [-1.5627, -0.6244],
                   [ 0.0744, -0.4101],
                   [ 0.5101,  0.3651]])

# 4x3 matrix that maps the concatenated history vectors into a hidden representation
# of the previous context, i.e. transforms a concatenated 1x4 vector into a new 1x3 vector
ffnn_C = np.array([[-1.5706, -0.1656,  1.0511],
                   [-0.1356, -2.6759,  1.9245],
                   [-0.1939, -0.1617, -1.6314],
                   [-0.1262,  3.3976, -1.2029]])

# bias for the 1x3 context representation (plain Python list, one entry per context dimension)
ffnn_C_bias = [-0.1961, -0.3983,  0.6573]

# a matrix that maps the 1x3 context representation back into the one-hot (vocabulary) dimension
# 3x13
ffnn_O = np.array([[-0.6443, -0.1039,  1.7336, -1.9931, -2.7326,  2.6318, -0.1785,
                    -3.8493,  1.7996,  2.6828, -0.1305,  2.6788, -2.028 ],
                   [-0.0994, -0.6617, -0.2894,  3.3309, -0.3907, -0.5301,  2.2434,
                    -2.9899, -0.7255, -0.4918, -0.0127, -0.5847,  2.2868],
                   [-0.8333, -0.1645,  2.905 , -1.1158,  3.9174, -2.2903, -1.3207,
                     0.8432,  2.9637, -1.8749, -0.4478, -1.9564, -1.2326]])

# output bias (plain Python list, one entry per vocabulary word, 13 in total)
ffnn_O_bias = [-1.2955, -1.6286, -1.7336, -3.3365, -1.5713,  1.3081,  1.6140,  6.0426,
                -1.2782,  0.4446, -1.4849,  0.5554,  1.7558]

# NOTE: the RNN parameters cell further down rebinds history_seed to ['<s>','a', 'girl'];
# after running the whole notebook only that later value remains.
history_seed = ['a', 'girl'] # a sequence to start prediction

# RNN PARAMETERS

In [4]:
# rnn_E is the 13x2 word-embedding matrix of the RNN LM: one row per vocabulary word.
# An embedding matrix is meant to map similar words closer together.
# Row ids follow ind_to_word / vocab_to_ind, e.g.
#   - the first row is the embedding for '<s>'
#   - the third row represents 'girl'
rnn_E = np.array([[ 0.1504, -1.4721],
                  [-0.0435, -0.2958],
                  [-0.7905,  1.4365],
                  [-0.9066,  1.3296],
                  [ 0.847 , -1.6606],
                  [-0.8622,  0.8892],
                  [ 0.6194,  0.0187],
                  [-0.0282, -0.0557],
                  [-1.4697, -0.2535],
                  [ 0.1498, -0.2534],
                  [-0.0733,  0.227 ],
                  [ 0.4131, -0.1014],
                  [ 0.3008,  0.1147]])

# 2x3 matrix to transform a word representation at time t
rnn_W_in = np.array([[-1.1178, -1.5827,  1.4378],
                    [ 2.7663, -0.3488, -1.2599]])

# bias paired with rnn_W_in (plain Python list of length 3)
rnn_bias_in = [0.7628,  1.8086, -0.5996]

# 3x3 matrix to transform a history representation for a word at time t
rnn_W_rec= np.array([[ 0.6662, -1.175 ,  0.7991],
                    [-0.4765, -0.7113, -0.3551],
                    [ 0.4244, -0.6188,  0.6422]])

# bias paired with rnn_W_rec (plain Python list of length 3)
rnn_bias_rec = [1.0755, 2.1216, 0.4872]

# 3x13 matrix that maps the 1x3 RNN representation of a word and its history
# back into the one-hot (vocabulary) dimension
rnn_O = np.array([[-1.056 , -1.3623,  0.3412,  0.2341,  1.9789, -0.8509,  1.2732,
                 0.4541,  0.4645, -1.5578, -1.4404, -1.3516,  1.5191],
               [-0.1945,  1.5966, -0.8853,  1.7061, -2.1607, -0.0595,  0.1308,
                -0.4909, -0.9871, -0.5746,  1.7799, -0.4463, -1.0677],
               [-0.4435, -0.0419, -0.9198, -1.6843, -2.4888,  1.6182, -0.173 ,
                 1.0161, -0.8084,  1.4198,  0.1248,  1.5048, -0.6309]])

# output bias (plain Python list, one entry per vocabulary word, 13 in total)
rnn_O_bias = [-0.3343,  0.2753,  3.2959, -0.0294, -0.8674, -1.1811, -0.7062, -0.0348,
         3.4313, -0.2929,  0.2799, -0.6557, -0.8015]

# NOTE: this rebinds history_seed, which the FFNN parameters cell set to ['a', 'girl'];
# from here on only the RNN seed is available under that name.
history_seed = ['<s>','a', 'girl'] # a sequence to start prediction

# Torch models
You can calculate everything by hand, or you can try using the models defined with PyTorch. You don't really have to: it is enough (and even preferable) to do everything by hand. The models are here just as a reference for how such models look in real life.

In [5]:
import torch
import torch.nn as nn

## FFNN
Below you can see how our FF LM is constructed.
After initializing our exact architecture with the FFNN class, we set the parameters to the ones we need. Now the model is ready to make predictions, for example to predict the word that follows "cat likes".

In [6]:
class FFNN(nn.Module):
    """Feed-forward n-gram language model over a fixed vocabulary.

    Every history word is one-hot encoded and projected to an embedding; the
    embeddings are concatenated, passed through a ReLU hidden layer and mapped
    to a softmax distribution over the vocabulary.

    Args:
        vocab_to_ind: mapping from a word to its integer index (0..V-1).
        embed_dim: size of a single word embedding.
        context_dim: size of the hidden context representation.
        history_len: number of history words consumed per prediction.
    """

    def __init__(self, vocab_to_ind, embed_dim, context_dim, history_len):
        super().__init__()

        self.vocab_to_ind = vocab_to_ind
        self.embed_dim = embed_dim
        self.context_dim = context_dim
        self.history_len = history_len
        vocab_len = len(self.vocab_to_ind)
        # A bias-free linear layer applied to a one-hot vector acts as an embedding lookup.
        self.word_embed = nn.Linear(vocab_len, self.embed_dim, bias=False)  # V x word_dim
        self.context_embed = nn.Linear(self.embed_dim * self.history_len, self.context_dim)  # history_len*word_dim x context_dim
        self.out = nn.Linear(self.context_dim, vocab_len)  # context_dim x V

    def word_to_one_hot(self, word):
        """Return the one-hot encoding of ``word`` as a 1 x V float tensor.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        ind = self.vocab_to_ind[word]
        # Fill one row directly instead of materialising a V x V identity on every call.
        one_hot = torch.zeros(1, len(self.vocab_to_ind))
        one_hot[0, ind] = 1.0
        return one_hot

    def forward(self, n_gram):
        """Predict the distribution of the word that follows ``n_gram``.

        Args:
            n_gram: iterable of exactly ``history_len`` vocabulary words.

        Returns:
            A 1 x V tensor of probabilities (each row sums to 1).

        Raises:
            ValueError: if ``n_gram`` does not hold exactly ``history_len`` words.
            KeyError: if a word is not in the vocabulary.
        """
        words = list(n_gram)
        if len(words) != self.history_len:
            # Fail early with a readable message instead of a matmul shape error further down.
            raise ValueError(
                'expected a history of {} words, got {}'.format(self.history_len, len(words))
            )

        # embed every word in the history and concatenate the embeddings
        embedded = [self.word_embed(self.word_to_one_hot(word)) for word in words]
        context = torch.cat(embedded, 1)  # 1 x history_len*word_dim
        context_representation = nn.functional.relu(self.context_embed(context))  # 1 x context_dim

        context_representation_to_logits = self.out(context_representation)  # 1 x V
        prediction = nn.functional.softmax(context_representation_to_logits, dim=1)  # 1 x V
        return prediction

In [7]:
# Build the feed-forward LM:
#   - word embeddings are 2d
#   - word history (context) embeddings are 3d
#   - the LM takes two words as history
ffnn_model = FFNN(vocab_to_ind, embed_dim=2, context_dim=3, history_len=2)

# Replace the default-initialised parameters with the hand-specified ones.
# The weight matrices are transposed because nn.Linear stores (out_features, in_features).
ffnn_model.word_embed.weight = nn.Parameter(torch.tensor(ffnn_E.T, dtype=torch.float32))

ffnn_model.context_embed.weight = nn.Parameter(torch.tensor(ffnn_C.T, dtype=torch.float32))
ffnn_model.context_embed.bias = nn.Parameter(torch.tensor(ffnn_C_bias))

ffnn_model.out.weight = nn.Parameter(torch.tensor(ffnn_O.T, dtype=torch.float32))
ffnn_model.out.bias = nn.Parameter(torch.tensor(ffnn_O_bias))

# Probability of every vocabulary word following "cat likes" (no gradients needed for inference).
with torch.no_grad():
    prediction = ffnn_model(['cat', 'likes'])

# One line per word, in index order: "<word> <probability>".
next_word_probs = prediction[0]
for i, word in ind_to_word.items():
    print(word, f'{next_word_probs[i]:f}')

<s> 0.000000
a 0.000000
girl 0.000010
likes 0.000000
eating 0.999959
by 0.000000
herself 0.000000
</s> 0.000000
cat 0.000030
meat 0.000000
the 0.000000
fish 0.000000
itself 0.000000


## RNN
Below you can see how our RNN LM is constructed.
You can set it up just as in the assignment using the example of FFNN above.

In [8]:
class RNN(nn.Module):
    """Recurrent language model over a fixed vocabulary.

    Each word is one-hot encoded and projected to an embedding; the embedded
    sentence is run through a single-layer ReLU ``nn.RNN`` and every hidden
    state is mapped to a softmax distribution over the vocabulary.

    Args:
        vocab_to_ind: mapping from a word to its integer index (0..V-1).
        embed_dim: size of a single word embedding.
        rnn_dim: size of the recurrent hidden state.
    """

    def __init__(self, vocab_to_ind, embed_dim, rnn_dim):
        super().__init__()

        self.vocab_to_ind = vocab_to_ind
        self.embed_dim = embed_dim
        self.rnn_dim = rnn_dim
        vocab_len = len(self.vocab_to_ind)
        # A bias-free linear layer applied to a one-hot vector acts as an embedding lookup.
        self.word_embed = nn.Linear(vocab_len, self.embed_dim, bias=False)  # V x word_dim
        self.rnn = nn.RNN(self.embed_dim, rnn_dim, nonlinearity='relu')  # word_dim x rnn_dim
        self.out = nn.Linear(self.rnn_dim, vocab_len)  # rnn_dim x V

    def word_to_one_hot(self, word):
        """Return the one-hot encoding of ``word`` as a 1 x V float tensor.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        ind = self.vocab_to_ind[word]
        # Fill one row directly instead of materialising a V x V identity on every call.
        one_hot = torch.zeros(1, len(self.vocab_to_ind))
        one_hot[0, ind] = 1.0
        return one_hot

    def forward(self, sentence):
        """Predict a next-word distribution after every word of ``sentence``.

        Args:
            sentence: non-empty iterable of vocabulary words.

        Returns:
            A len(sentence) x V tensor; row t holds the probabilities of the
            word following the first t+1 words.

        Raises:
            ValueError: if ``sentence`` is empty.
            KeyError: if a word is not in the vocabulary.
        """
        words = list(sentence)
        if not words:
            # Fail early with a readable message instead of an error from torch.cat.
            raise ValueError('sentence must contain at least one word')
        n_words = len(words)

        sentence_one_hot = torch.cat([self.word_to_one_hot(word) for word in words], 0)  # len(sentence) x V
        embeds = self.word_embed(sentence_one_hot)  # len(sentence) x word_dim

        # nn.RNN expects (seq_len, batch, features); the sentence is a batch of size 1.
        rnn_out, _ = self.rnn(embeds.view(n_words, 1, -1))  # len(sent) x 1 x rnn_dim

        word_logits = self.out(rnn_out.view(n_words, -1))  # len(sent) x V
        probabilities = nn.functional.softmax(word_logits, dim=1)

        return probabilities

In [None]:
# initialize a model
# word embeddings are 2d
# rnn word history embeddings are 3d

rnn_model =  RNN(vocab_to_ind, 2, 3)

# set parameters to ours
# EXERCISE: every ####### placeholder below has to be replaced before this cell can run;
# as written, the bare assignments are a SyntaxError.
# The FFNN cell earlier in the notebook shows the pattern used for nn.Linear layers:
# wrap the transposed matrix (or the bias list) in torch.nn.Parameter(torch.tensor(...)).
# NOTE(review): presumably the nn.RNN weights need the same transposition of
# rnn_W_in / rnn_W_rec -- confirm against the nn.RNN documentation.
rnn_model.word_embed.weight = #######

rnn_model.rnn.weight_ih_l0 = #######
rnn_model.rnn.bias_ih_l0 = #######

rnn_model.rnn.weight_hh_l0 = #######
rnn_model.rnn.bias_hh_l0 = #######

rnn_model.out.weight = #######
rnn_model.out.bias = #######

# get the probability distribution for every word in vocabulary to follow "<s> a girl"
with torch.no_grad():
    prediction = rnn_model(['<s>','a','girl'])

# the model returns one row per input word; row 2 is the distribution after the third word ('girl')
for i in range(len(ind_to_word)):
    print(ind_to_word[i], '{:f}'.format(prediction[2][i]))