# Recommender-systems examples: latent factor models (Surprise / Tensorflow)
# and Bayesian Personalized Ranking (Implicit / Tensorflow).

import gzip
import random
import scipy
import tensorflow as tf
from collections import defaultdict
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

# Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/.
# Download and save to your own directory.
dataDir = "/home/jmcauley/pml_data/"

# --- Latent factor model (Surprise) ---

# Use the library's inbuilt data reader to parse tsv-formatted
# (user, item, rating) rows.
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_file(dataDir + "goodreads_fantasy.tsv", reader=reader)

# Inbuilt utility to split the ratings into training and test fractions.
trainset, testset = train_test_split(data, test_size=0.25)

# Standard latent-factor model: fit it and extract predictions on the
# held-out set.
model = SVD()
model.fit(trainset)
predictions = model.test(testset)
# Estimate for a single (test) rating.
predictions[0].est

# MSE of the model's predictions on the test set
# (p.r_ui is the actual rating, p.est the model's estimate).
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

print(sse / len(predictions))


# --- Bayesian Personalized Ranking (Implicit) ---

def parseData(fname):
    """Yield one review dict per line of a gzipped JSON-lines file.

    The 'review_text' field is dropped to save memory, since the reviews
    are not used here.
    """
    import json  # local import keeps this notebook cell self-contained
    for l in gzip.open(fname):
        # json.loads replaces eval(): the Goodreads dumps are JSON-lines,
        # and eval() on file contents is unsafe (arbitrary code execution)
        # and slower.
        d = json.loads(l)
        del d['review_text']  # discard the reviews, to save memory when we don't use them
        yield d

# Full dataset of Goodreads fantasy reviews (fairly memory-hungry, could be
# replaced by something smaller).
data = list(parseData(dataDir + "goodreads_reviews_fantasy_paranormal.json.gz"))

random.shuffle(data)
# Show one example record from the dataset.
data[0]

# Build a few utility data structures. Since the data will be converted to a
# sparse interaction matrix, the key step is mapping each user/item string to
# a contiguous integer ID in [0, nUsers) / [0, nItems).
userIDs, itemIDs = {}, {}

for review in data:
    user, item = review['user_id'], review['book_id']
    # setdefault assigns the next free index only when the key is new
    userIDs.setdefault(user, len(userIDs))
    itemIDs.setdefault(item, len(itemIDs))

nUsers, nItems = len(userIDs), len(itemIDs)

nUsers, nItems
# Convert the dataset to a sparse item-user matrix, storing only positive
# feedback instances (i.e., rated items). lil_matrix supports efficient
# incremental assignment; it is converted to CSR afterwards.
Xiu = scipy.sparse.lil_matrix((nItems, nUsers))
for review in data:
    Xiu[itemIDs[review['book_id']], userIDs[review['user_id']]] = 1

# User-item orientation (the transpose), used by recommend() below.
Xui = scipy.sparse.csr_matrix(Xiu.T)

# Bayesian Personalized Ranking model with 5 latent factors.
model = bpr.BayesianPersonalizedRanking(factors=5)

# Fit the model. NOTE(review): fitting on the item-user matrix matches older
# implicit releases; newer versions expect user-item input — confirm against
# the installed version.
model.fit(Xiu)

# Recommendations for a particular user (the first one), and items whose
# latent factors are most similar to a particular item's.
recommended = model.recommend(0, Xui)
related = model.similar_items(0)

related
# Extract the learned user and item factors from the Implicit BPR model.
itemFactors = model.item_factors
userFactors = model.user_factors

itemFactors[0]


# --- Latent factor model (Tensorflow) ---

def parse(path):
    """Yield one record per line of a gzipped JSON-lines file."""
    import json  # local import keeps this notebook cell self-contained
    # 'with' closes the file handle once iteration finishes (the original
    # leaked it); json.loads replaces the unsafe/slow eval() on file data.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield json.loads(l)

# Goodreads comic book data: assign integer IDs and collect
# (user, item, rating) interaction triples.
userIDs = {}
itemIDs = {}
interactions = []

for d in parse(dataDir + "goodreads_reviews_comics_graphic.json.gz"):
    u = d['user_id']
    i = d['book_id']
    r = d['rating']
    if u not in userIDs: userIDs[u] = len(userIDs)
    if i not in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,r))

random.shuffle(interactions)
len(interactions)

# Split into train (90%) and test (10%) sets.
nTrain = int(len(interactions) * 0.9)
nTest = len(interactions) - nTrain
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]
# Per-user and per-item interaction lists (training split only).
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i,r in interactionsTrain:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)

# Mean training rating, used only to initialize the offset term.
mu = sum(r for _,_,r in interactionsTrain) / len(interactionsTrain)

# Gradient descent optimizer; the learning rate could be experimented with.
optimizer = tf.keras.optimizers.Adam(0.1)


class LatentFactorModel(tf.keras.Model):
    """Latent factor model: alpha + betaU[u] + betaI[i] + gammaU[u].gammaI[i]."""

    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Global offset starts at the mean rating; biases and K-dimensional
        # factors start at small random values.
        self.alpha = tf.Variable(mu)
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs), K], stddev=0.001))
        self.lamb = lamb  # regularization coefficient

    def predict(self, u, i):
        """Predicted rating for a single (user, item) pair of integer IDs."""
        interaction = tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return self.alpha + self.betaU[u] + self.betaI[i] + interaction

    def reg(self):
        """L2 regularizer over all bias and factor parameters."""
        return self.lamb * (tf.reduce_sum(self.betaU**2) +
                            tf.reduce_sum(self.betaI**2) +
                            tf.reduce_sum(self.gammaU**2) +
                            tf.reduce_sum(self.gammaI**2))

    def predictSample(self, sampleU, sampleI):
        """Vectorized predictions for parallel lists of user/item IDs."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        interaction = tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return self.alpha + beta_u + beta_i + interaction

    def call(self, sampleU, sampleI, sampleR):
        """Loss over a sample (tf.nn.l2_loss is 0.5 * sum of squares)."""
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)


# Initialize the model; could experiment with the number of factors and the
# regularization rate.
modelLFM = LatentFactorModel(mu, 5, 0.00001)


def trainingStep(model, interactions):
    """One batch of gradient descent (batch-based model from Chapter 5).

    Draws 50,000 interactions with replacement, takes one optimizer step,
    and returns the batch loss as a numpy scalar.
    """
    n_samples = 50000
    with tf.GradientTape() as tape:
        batch = [random.choice(interactions) for _ in range(n_samples)]
        sampleU = [userIDs[u] for u,_,_ in batch]
        sampleI = [itemIDs[i] for _,i,_ in batch]
        sampleR = [r for _,_,r in batch]

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()
# Run 100 iterations (really 100 batches) of gradient descent.
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))

# Prediction for a particular (held-out) user/item pair.
u,i,r = interactionsTest[0]

modelLFM.predict(userIDs[u], itemIDs[i]).numpy()


# --- Bayesian personalized ranking (Tensorflow) ---

items = list(itemIDs.keys())


class BPRbatch(tf.keras.Model):
    """Batch-based BPR (Chapter 5): ranks items by betaI[i] + gammaU[u].gammaI[i]."""

    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Item biases and K-dimensional user/item factors, small random init.
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs), K], stddev=0.001))
        self.lamb = lamb  # regularization coefficient

    def predict(self, u, i):
        """Unnormalized compatibility score for one (user, item) pair."""
        return self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)

    def reg(self):
        """L2 regularizer over item biases and both factor matrices."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +
                            tf.nn.l2_loss(self.gammaU) +
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Vectorized scores x_ui for parallel lists of user/item IDs."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        return beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)

    def call(self, sampleU, sampleI, sampleJ):
        """BPR loss: -mean log sigmoid(x_ui - x_uj) over (positive i, negative j)."""
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))


optimizer = tf.keras.optimizers.Adam(0.1)

modelBPR = BPRbatch(5, 0.00001)


def trainingStepBPR(model, interactions):
    """One BPR batch: 50,000 (user, positive item, negative item) triples."""
    n_samples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(n_samples):
            u,i,_ = random.choice(interactions)  # positive sample
            j = random.choice(items)             # negative sample:
            while j in itemsPerUser[u]:          # re-draw until unseen by u
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


# Run 100 batches of gradient descent (note: this demo trains on the full
# interaction list, not just the training split).
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))
# Score for a particular user/item pair. Note that this is an unnormalized
# score (usable for ranking), rather than a prediction of a rating.
u,i,_ = interactionsTest[0]

modelBPR.predict(userIDs[u], itemIDs[i]).numpy()


# --- Exercises ---

# 5.1: adapt the latent factor model above, simply deleting any terms
# associated with latent factors.

class LatentFactorModelBiasOnly(tf.keras.Model):
    """Bias-only rating model: alpha + betaU[u] + betaI[i] (no latent factors)."""

    def __init__(self, mu, lamb):
        super(LatentFactorModelBiasOnly, self).__init__()
        # Offset starts at the mean rating; biases start near zero.
        self.alpha = tf.Variable(mu)
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))
        self.lamb = lamb  # regularization coefficient

    def predict(self, u, i):
        """Predicted rating for a single (user, item) pair of integer IDs."""
        return self.alpha + self.betaU[u] + self.betaI[i]

    def reg(self):
        """L2 regularizer over both bias vectors."""
        return self.lamb * (tf.reduce_sum(self.betaU**2) +
                            tf.reduce_sum(self.betaI**2))

    def predictSample(self, sampleU, sampleI):
        """Vectorized predictions for parallel lists of user/item IDs."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        return self.alpha + beta_u + beta_i

    def call(self, sampleU, sampleI, sampleR):
        """Loss over a sample (tf.nn.l2_loss is 0.5 * sum of squares)."""
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)


modelBiasOnly = LatentFactorModelBiasOnly(mu, 0.00001)


def trainingStepBiasOnly(model, interactions):
    """One batch of gradient descent for the bias-only model."""
    n_samples = 50000
    with tf.GradientTape() as tape:
        batch = [random.choice(interactions) for _ in range(n_samples)]
        sampleU = [userIDs[u] for u,_,_ in batch]
        sampleI = [itemIDs[i] for _,i,_ in batch]
        sampleR = [r for _,_,r in batch]

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


for i in range(50):
    obj = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))
# Compute the MSEs for a model which always predicts the mean, versus one
# which involves bias terms.

def MSE(predictions, labels):
    """Mean squared error between parallel sequences of predictions and labels."""
    sq_errors = [(x - y)**2 for x, y in zip(predictions, labels)]
    return sum(sq_errors) / len(sq_errors)

# Trivial baseline: always predict the training mean.
alwaysPredictMean = [mu for _ in interactionsTest]
labels = [r for _,_,r in interactionsTest]

MSE(alwaysPredictMean, labels)

biasOnlyPredictions = [modelBiasOnly.predict(userIDs[u], itemIDs[i]).numpy()
                       for u,i,_ in interactionsTest]

biasOnlyPredictions[0]

MSE(biasOnlyPredictions, labels)

# 5.2: performance of a complete latent factor model (using the latent factor
# model implementation from the examples above), now with 10 factors.
optimizer = tf.keras.optimizers.Adam(0.1)
modelLFM = LatentFactorModel(mu, 10, 0.00001)
for i in range(50):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))

predictions = [modelLFM.predict(userIDs[u], itemIDs[i]).numpy()
               for u,i,_ in interactionsTest]

MSE(predictions, labels)

# (probably needs a little more tuning in terms of number of latent factors,
# learning rate, etc.)

# 5.3: experiment with rounding the predictions to the nearest integer.
predictionsRounded = [int(p + 0.5) for p in predictions]

MSE(predictionsRounded, labels)

# Rounding seems to result in worse performance. Rough explanation: for a
# variable that is "1" half the time and "2" half the time, always predicting
# 1.5 (moderate error every time) yields a lower MSE than always predicting
# 1 or 2 (large error half the time).
# 5.4: following the BPR code from the examples above, trained on the
# training interactions only, then evaluated with a sampled AUC.
optimizer = tf.keras.optimizers.Adam(0.1)
modelBPR = BPRbatch(10, 0.00001)

for i in range(50):
    obj = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

# Test-set positives per user, and the set of all items seen in the test set.
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u,i,_ in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)

def AUCu(u, N): # N samples per user
    """Sampled AUC for user u: fraction of (positive, negative) pairs in
    which the positive item outscores the negative one.

    N is capped at the user's number of test-set positives. Negatives are
    drawn from test items the user has no test interaction with
    (NOTE(review): they may still appear in the user's training set).
    """
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    # random.sample() requires a sequence: sampling directly from a set is
    # deprecated since Python 3.9 and raises TypeError from 3.11 on, so the
    # sets are materialized as lists first.
    positive = random.sample(list(interactionsTestPerUser[u]), N)
    negative = random.sample(list(itemSet.difference(interactionsTestPerUser[u])), N)
    for i,j in zip(positive,negative):
        si = modelBPR.predict(userIDs[u], itemIDs[i]).numpy()
        sj = modelBPR.predict(userIDs[u], itemIDs[j]).numpy()
        if si > sj:
            win += 1
    return win/N

def AUC():
    """Average sampled AUC (10 samples per user) over all test-set users."""
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(u, 10))
    return sum(av) / len(av)

AUC()