{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# General Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]\n"
     ]
    }
   ],
   "source": [
    "# Toy training corpus; every sentence is wrapped in <s> ... </s> boundary markers.\n",
    "train = ['<s> a girl likes eating by herself </s>',\n",
    "         '<s> a cat likes eating meat </s>',\n",
    "         '<s> the cat likes eating fish </s>',\n",
    "         '<s> the girl herself </s>',\n",
    "         '<s> the cat likes eating by itself </s>']\n",
    "\n",
    "# Word -> index. The index selects the word's row in the embedding matrices and its\n",
    "# column in the output matrices defined in the cells below.\n",
    "vocab_to_ind = {'<s>': 0, 'a': 1, 'girl': 2, 'likes': 3, 'eating': 4, 'by': 5,\n",
    "                'herself': 6, '</s>': 7, 'cat': 8, 'meat': 9, 'the': 10, 'fish': 11, 'itself': 12}\n",
    "\n",
    "# Index -> word, derived from vocab_to_ind so the two mappings cannot drift apart.\n",
    "ind_to_word = {ind: word for word, ind in vocab_to_ind.items()}\n",
    "\n",
    "# Derived rather than hard-coded so it follows any change to the vocabulary.\n",
    "vocab_size = len(vocab_to_ind)\n",
    "\n",
    "# Row i is the one-hot vector of the word with index i.\n",
    "one_hot_matrix = np.identity(vocab_size)\n",
    "print(one_hot_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# FFNN PARAMETERS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ffnn_E: word-embedding matrix, 13x2 -- one row per vocabulary word.\n",
    "# Row i embeds the word with index i in vocab_to_ind / ind_to_word,\n",
    "# e.g. the first row is the embedding for '<s>' and the third row represents 'girl'.\n",
    "# The embedding is meant to map similar words closer together.\n",
    "ffnn_E = np.array([[-0.7488, -0.7405],\n",
    "                   [ 0.1707, -2.9365],\n",
    "                   [ 0.0066,  0.8496],\n",
    "                   [-1.0986, -0.7243],\n",
    "                   [ 0.5666, -0.4858],\n",
    "                   [ 0.7944,  0.2611],\n",
    "                   [ 0.6391,  0.3745],\n",
    "                   [ 0.0274, -0.2199],\n",
    "                   [ 0.6098,  3.7667],\n",
    "                   [ 0.1053, -0.4606],\n",
    "                   [-1.5627, -0.6244],\n",
    "                   [ 0.0744, -0.4101],\n",
    "                   [ 0.5101,  0.3651]])\n",
    "\n",
    "# ffnn_C: maps the concatenated history vectors into a hidden representation of the\n",
    "# previous context, i.e. transforms the concatenated 1x4 vector into a new 1x3 vector\n",
    "ffnn_C = np.array([[-1.5706, -0.1656,  1.0511],\n",
    "                   [-0.1356, -2.6759,  1.9245],\n",
    "                   [-0.1939, -0.1617, -1.6314],\n",
    "                   [-0.1262,  3.3976, -1.2029]])\n",
    "\n",
    "# bias of the hidden layer, one value per entry of the 1x3 context vector\n",
    "ffnn_C_bias = [-0.1961, -0.3983,  0.6573]\n",
    "\n",
    "# ffnn_O: maps the 1x3 context representation back into the one-hot (vocabulary) dimension\n",
    "# 3x13\n",
    "ffnn_O = np.array([[-0.6443, -0.1039,  1.7336, -1.9931, -2.7326,  2.6318, -0.1785,\n",
    "                    -3.8493,  1.7996,  2.6828, -0.1305,  2.6788, -2.028 ],\n",
    "                   [-0.0994, -0.6617, -0.2894,  3.3309, -0.3907, -0.5301,  2.2434,\n",
    "                    -2.9899, -0.7255, -0.4918, -0.0127, -0.5847,  2.2868],\n",
    "                   [-0.8333, -0.1645,  2.905 , -1.1158,  3.9174, -2.2903, -1.3207,\n",
    "                     0.8432,  2.9637, -1.8749, -0.4478, -1.9564, -1.2326]])\n",
    "\n",
    "# output bias, one value per vocabulary word (13 entries)\n",
    "ffnn_O_bias = [-1.2955, -1.6286, -1.7336, -3.3365, -1.5713,  1.3081,  1.6140,  6.0426,\n",
    "                -1.2782,  0.4446, -1.4849,  0.5554,  1.7558]\n",
    "\n",
    "# NOTE: the RNN parameters cell further down rebinds `history_seed`;\n",
    "# re-run this cell before using the name for the FFNN after a full run.\n",
    "history_seed = ['a', 'girl'] # a sequence to start prediction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RNN PARAMETERS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rnn_E: word-embedding matrix, 13x2 -- one row per vocabulary word.\n",
    "# Row i embeds the word with index i in vocab_to_ind / ind_to_word,\n",
    "# e.g. the first row is the embedding for '<s>' and the third row represents 'girl'.\n",
    "# The embedding is meant to map similar words closer together.\n",
    "rnn_E = np.array([[ 0.1504, -1.4721],\n",
    "                  [-0.0435, -0.2958],\n",
    "                  [-0.7905,  1.4365],\n",
    "                  [-0.9066,  1.3296],\n",
    "                  [ 0.847 , -1.6606],\n",
    "                  [-0.8622,  0.8892],\n",
    "                  [ 0.6194,  0.0187],\n",
    "                  [-0.0282, -0.0557],\n",
    "                  [-1.4697, -0.2535],\n",
    "                  [ 0.1498, -0.2534],\n",
    "                  [-0.0733,  0.227 ],\n",
    "                  [ 0.4131, -0.1014],\n",
    "                  [ 0.3008,  0.1147]])\n",
    "\n",
    "# rnn_W_in: a matrix to transform a word representation at time t (2x3)\n",
    "rnn_W_in = np.array([[-1.1178, -1.5827,  1.4378],\n",
    "                    [ 2.7663, -0.3488, -1.2599]])\n",
    "\n",
    "# input bias, 3 entries\n",
    "rnn_bias_in = [0.7628,  1.8086, -0.5996]\n",
    "\n",
    "# rnn_W_rec: a matrix to transform a history representation for a word at time t (3x3)\n",
    "rnn_W_rec= np.array([[ 0.6662, -1.175 ,  0.7991],\n",
    "                    [-0.4765, -0.7113, -0.3551],\n",
    "                    [ 0.4244, -0.6188,  0.6422]])\n",
    "\n",
    "# recurrent bias, 3 entries\n",
    "rnn_bias_rec = [1.0755, 2.1216, 0.4872]\n",
    "\n",
    "# rnn_O: maps the 1x3 RNN representation of a word and its history back into the\n",
    "# one-hot (vocabulary) dimension; 3x13\n",
    "rnn_O = np.array([[-1.056 , -1.3623,  0.3412,  0.2341,  1.9789, -0.8509,  1.2732,\n",
    "                 0.4541,  0.4645, -1.5578, -1.4404, -1.3516,  1.5191],\n",
    "               [-0.1945,  1.5966, -0.8853,  1.7061, -2.1607, -0.0595,  0.1308,\n",
    "                -0.4909, -0.9871, -0.5746,  1.7799, -0.4463, -1.0677],\n",
    "               [-0.4435, -0.0419, -0.9198, -1.6843, -2.4888,  1.6182, -0.173 ,\n",
    "                 1.0161, -0.8084,  1.4198,  0.1248,  1.5048, -0.6309]])\n",
    "\n",
    "# output bias, one value per vocabulary word (13 entries)\n",
    "rnn_O_bias = [-0.3343,  0.2753,  3.2959, -0.0294, -0.8674, -1.1811, -0.7062, -0.0348,\n",
    "         3.4313, -0.2929,  0.2799, -0.6557, -0.8015]\n",
    "\n",
    "# NOTE: this rebinds `history_seed`, replacing the two-word seed assigned in the\n",
    "# FFNN parameters cell above.\n",
    "history_seed = ['<s>','a', 'girl'] # a sequence to start prediction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Torch models\n",
    "You can calculate everything by hand, or you can try using the models defined with PyTorch. You don't really have to: it is enough (and even preferable) to do everything by hand. The models are here just as a reference for how such models look in practice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## FFNN\n",
    "Below you can see how our FF LM is constructed.\n",
    "After initializing our exact architecture with the FFNN class, we set the parameters to the ones we need. Now the model is ready to make predictions, for example, to predict the word that follows \"cat likes\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "class FFNN(nn.Module):\n",
    "    \"\"\"Feed-forward n-gram language model over one-hot encoded words.\n",
    "\n",
    "    Each history word is embedded, the embeddings are concatenated, passed through\n",
    "    a ReLU hidden layer and projected back onto the vocabulary.\n",
    "\n",
    "    Args:\n",
    "        vocab_to_ind: dict mapping every vocabulary word to its index.\n",
    "        embed_dim: size of a single word embedding.\n",
    "        context_dim: size of the hidden representation of the history.\n",
    "        history_len: number of history words the model conditions on.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vocab_to_ind, embed_dim, context_dim, history_len):\n",
    "        super().__init__()\n",
    "\n",
    "        self.vocab_to_ind = vocab_to_ind\n",
    "        self.embed_dim = embed_dim\n",
    "        self.context_dim = context_dim\n",
    "        self.history_len = history_len\n",
    "        vocab_size = len(self.vocab_to_ind)\n",
    "        # nn.Linear stores its weight as (out_features, in_features), i.e. transposed\n",
    "        # with respect to the `x @ W` shapes given in the comments below.\n",
    "        self.word_embed = nn.Linear(vocab_size, self.embed_dim, bias=False)  # V x embed_dim\n",
    "        self.context_embed = nn.Linear(self.embed_dim * self.history_len,\n",
    "                                       self.context_dim)  # history_len*embed_dim x context_dim\n",
    "        self.out = nn.Linear(self.context_dim, vocab_size)  # context_dim x V\n",
    "\n",
    "    def word_to_one_hot(self, word):\n",
    "        \"\"\"Return the 1 x V one-hot row vector of `word`.\n",
    "\n",
    "        Raises:\n",
    "            KeyError: if `word` is not in the vocabulary.\n",
    "        \"\"\"\n",
    "        ind = self.vocab_to_ind[word]\n",
    "        # Fill a single row directly instead of building a V x V identity matrix per call.\n",
    "        one_hot = torch.zeros(1, len(self.vocab_to_ind))\n",
    "        one_hot[0, ind] = 1.0\n",
    "        return one_hot\n",
    "\n",
    "    def forward(self, n_gram):\n",
    "        \"\"\"Return a 1 x V tensor with the next-word distribution given the history.\n",
    "\n",
    "        Args:\n",
    "            n_gram: sequence of `history_len` vocabulary words.\n",
    "        \"\"\"\n",
    "        # embed every word of the history and concatenate the embeddings\n",
    "        embeds = [self.word_embed(self.word_to_one_hot(word)) for word in n_gram]\n",
    "        context = torch.cat(embeds, 1)  # 1 x history_len*embed_dim\n",
    "        context_representation = nn.functional.relu(self.context_embed(context))  # 1 x context_dim\n",
    "\n",
    "        logits = self.out(context_representation)  # 1 x V\n",
    "        return nn.functional.softmax(logits, dim=1)  # 1 x V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<s> 0.000000\n",
      "a 0.000000\n",
      "girl 0.000010\n",
      "likes 0.000000\n",
      "eating 0.999959\n",
      "by 0.000000\n",
      "herself 0.000000\n",
      "</s> 0.000000\n",
      "cat 0.000030\n",
      "meat 0.000000\n",
      "the 0.000000\n",
      "fish 0.000000\n",
      "itself 0.000000\n"
     ]
    }
   ],
   "source": [
    "# Instantiate the FFNN LM: 2-d word embeddings, a 3-d representation of the\n",
    "# history, and a history of two words.\n",
    "ffnn_model = FFNN(vocab_to_ind, embed_dim=2, context_dim=3, history_len=2)\n",
    "\n",
    "# Replace the random initialisation with the hand-specified parameters.\n",
    "# nn.Linear stores weights as (out_features, in_features), hence the transposes.\n",
    "ffnn_model.word_embed.weight = nn.Parameter(torch.tensor(ffnn_E.T, dtype=torch.float32))\n",
    "\n",
    "ffnn_model.context_embed.weight = nn.Parameter(torch.tensor(ffnn_C.T, dtype=torch.float32))\n",
    "ffnn_model.context_embed.bias = nn.Parameter(torch.tensor(ffnn_C_bias))\n",
    "\n",
    "ffnn_model.out.weight = nn.Parameter(torch.tensor(ffnn_O.T, dtype=torch.float32))\n",
    "ffnn_model.out.bias = nn.Parameter(torch.tensor(ffnn_O_bias))\n",
    "\n",
    "# Probability of every vocabulary word following \"cat likes\".\n",
    "with torch.no_grad():\n",
    "    prediction = ffnn_model(['cat', 'likes'])\n",
    "\n",
    "for word_ind in range(len(ind_to_word)):\n",
    "    print(ind_to_word[word_ind], f'{prediction[0][word_ind]:f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RNN\n",
    "Below you can see how our RNN LM is constructed.\n",
    "You can set it up just as in the assignment using the example of FFNN above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "class RNN(nn.Module):\n",
    "    \"\"\"Single-layer ReLU RNN language model over one-hot encoded words.\n",
    "\n",
    "    Every word of the sentence is embedded, the embeddings are fed to an nn.RNN one\n",
    "    step at a time, and each hidden state is projected back onto the vocabulary.\n",
    "\n",
    "    Args:\n",
    "        vocab_to_ind: dict mapping every vocabulary word to its index.\n",
    "        embed_dim: size of a single word embedding.\n",
    "        rnn_dim: size of the recurrent hidden state.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vocab_to_ind, embed_dim, rnn_dim):\n",
    "        super().__init__()\n",
    "\n",
    "        self.vocab_to_ind = vocab_to_ind\n",
    "        self.embed_dim = embed_dim\n",
    "        self.rnn_dim = rnn_dim\n",
    "        vocab_size = len(self.vocab_to_ind)\n",
    "        self.word_embed = nn.Linear(vocab_size, self.embed_dim, bias=False)  # V x embed_dim\n",
    "        self.rnn = nn.RNN(self.embed_dim, self.rnn_dim, nonlinearity='relu')  # embed_dim x rnn_dim\n",
    "        self.out = nn.Linear(self.rnn_dim, vocab_size)  # rnn_dim x V\n",
    "\n",
    "    def word_to_one_hot(self, word):\n",
    "        \"\"\"Return the 1 x V one-hot row vector of `word`.\n",
    "\n",
    "        Raises:\n",
    "            KeyError: if `word` is not in the vocabulary.\n",
    "        \"\"\"\n",
    "        ind = self.vocab_to_ind[word]\n",
    "        # Fill a single row directly instead of building a V x V identity matrix per call.\n",
    "        one_hot = torch.zeros(1, len(self.vocab_to_ind))\n",
    "        one_hot[0, ind] = 1.0\n",
    "        return one_hot\n",
    "\n",
    "    def forward(self, sentence):\n",
    "        \"\"\"Return a len(sentence) x V tensor of next-word distributions.\n",
    "\n",
    "        Row t is the softmax over the vocabulary computed from the hidden state\n",
    "        after the first t + 1 words of `sentence`.\n",
    "        \"\"\"\n",
    "        one_hots = [self.word_to_one_hot(word) for word in sentence]\n",
    "        sentence_one_hot = torch.cat(one_hots, 0)  # len(sentence) x V\n",
    "        embeds = self.word_embed(sentence_one_hot)  # len(sentence) x embed_dim\n",
    "\n",
    "        # nn.RNN expects (seq_len, batch, input_size); the batch holds one sentence.\n",
    "        rnn_out, _ = self.rnn(embeds.view(len(sentence), 1, -1))  # len(sentence) x 1 x rnn_dim\n",
    "\n",
    "        word_logits = self.out(rnn_out.view(len(sentence), -1))  # len(sentence) x V\n",
    "        return nn.functional.softmax(word_logits, dim=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The `#######` placeholders below are left for the reader to fill in (each one with a\n",
    "# torch.nn.Parameter, as in the FFNN set-up cell above); this cell does not parse\n",
    "# until all of them are replaced.\n",
    "\n",
    "# initialize a model\n",
    "# word embeddings are 2d\n",
    "# rnn word history embeddings are 3d\n",
    "\n",
    "rnn_model =  RNN(vocab_to_ind, 2, 3)\n",
    "\n",
    "# set parameters to ours\n",
    "# nn.Linear expects weight of shape (out_features, in_features)\n",
    "rnn_model.word_embed.weight = #######\n",
    "\n",
    "# nn.RNN expects weight_ih_l0 of shape (hidden_size, input_size) and bias_ih_l0 of shape (hidden_size,)\n",
    "rnn_model.rnn.weight_ih_l0 = #######\n",
    "rnn_model.rnn.bias_ih_l0 = #######\n",
    "\n",
    "# nn.RNN expects weight_hh_l0 of shape (hidden_size, hidden_size) and bias_hh_l0 of shape (hidden_size,)\n",
    "rnn_model.rnn.weight_hh_l0 = #######\n",
    "rnn_model.rnn.bias_hh_l0 = #######\n",
    "\n",
    "rnn_model.out.weight = #######\n",
    "rnn_model.out.bias = #######\n",
    "\n",
    "# get the probability distribution for every word in vocabulary to follow \"<s> a girl\"\n",
    "with torch.no_grad():\n",
    "    prediction = rnn_model(['<s>','a','girl'])\n",
    "\n",
    "# the model returns one row per input word; row 2 is the distribution computed after\n",
    "# the whole three-word history has been read\n",
    "for i in range(len(ind_to_word)):\n",
    "    print(ind_to_word[i], '{:f}'.format(prediction[2][i]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}