# Recommender-systems examples: latent factor models (Surprise / Tensorflow)
# and Bayesian Personalized Ranking (Implicit / Tensorflow).

import gzip
import random
import scipy
import tensorflow as tf
from collections import defaultdict
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

# Data is available at http://cseweb.ucsd.edu/~jmcauley/pml/data/.
# Download and save to your own directory.
dataDir = "/home/jmcauley/pml_data/"

# --- Latent factor model (Surprise) ---

# Use the library's inbuilt data reader to parse tsv-formatted
# (user, item, rating) rows.
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_file(dataDir + "goodreads_fantasy.tsv", reader=reader)

# Inbuilt utility to split the ratings into training and test fractions.
trainset, testset = train_test_split(data, test_size=0.25)

# Standard latent-factor model: fit it and extract predictions on the
# held-out set.
model = SVD()
model.fit(trainset)
predictions = model.test(testset)
# Estimate for a single (test) rating.
predictions[0].est

# MSE of the model's predictions on the test set
# (p.r_ui is the actual rating, p.est the model's estimate).
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

print(sse / len(predictions))


# --- Bayesian Personalized Ranking (Implicit) ---

def parseData(fname):
    """Yield one review dict per line of a gzipped JSON-lines file.

    The 'review_text' field is dropped to save memory, since the reviews
    are not used here.
    """
    import json  # local import keeps this notebook cell self-contained
    for l in gzip.open(fname):
        # json.loads replaces eval(): the Goodreads dumps are JSON-lines,
        # and eval() on file contents is unsafe (arbitrary code execution)
        # and slower.
        d = json.loads(l)
        del d['review_text']  # discard the reviews, to save memory when we don't use them
        yield d

# Full dataset of Goodreads fantasy reviews (fairly memory-hungry, could be
# replaced by something smaller).
data = list(parseData(dataDir + "goodreads_reviews_fantasy_paranormal.json.gz"))

random.shuffle(data)
# Show one example record from the dataset.
data[0]

# Build a few utility data structures. Since the data will be converted to a
# sparse interaction matrix, the key step is mapping each user/item string to
# a contiguous integer ID in [0, nUsers) / [0, nItems).
userIDs, itemIDs = {}, {}

for review in data:
    user, item = review['user_id'], review['book_id']
    # setdefault assigns the next free index only when the key is new
    userIDs.setdefault(user, len(userIDs))
    itemIDs.setdefault(item, len(itemIDs))

nUsers, nItems = len(userIDs), len(itemIDs)

nUsers, nItems
# Convert the dataset to a sparse item-user matrix, storing only positive
# feedback instances (i.e., rated items). lil_matrix supports efficient
# incremental assignment; it is converted to CSR afterwards.
Xiu = scipy.sparse.lil_matrix((nItems, nUsers))
for review in data:
    Xiu[itemIDs[review['book_id']], userIDs[review['user_id']]] = 1

# User-item orientation (the transpose), used by recommend() below.
Xui = scipy.sparse.csr_matrix(Xiu.T)

# Bayesian Personalized Ranking model with 5 latent factors.
model = bpr.BayesianPersonalizedRanking(factors=5)

# Fit the model. NOTE(review): fitting on the item-user matrix matches older
# implicit releases; newer versions expect user-item input — confirm against
# the installed version.
model.fit(Xiu)

# Recommendations for a particular user (the first one), and items whose
# latent factors are most similar to a particular item's.
recommended = model.recommend(0, Xui)
related = model.similar_items(0)

related
# Extract the learned user and item factors from the Implicit BPR model.
itemFactors = model.item_factors
userFactors = model.user_factors

itemFactors[0]


# --- Latent factor model (Tensorflow) ---

def parse(path):
    """Yield one record per line of a gzipped JSON-lines file."""
    import json  # local import keeps this notebook cell self-contained
    # 'with' closes the file handle once iteration finishes (the original
    # leaked it); json.loads replaces the unsafe/slow eval() on file data.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield json.loads(l)

# Goodreads comic book data: assign integer IDs and collect
# (user, item, rating) interaction triples.
userIDs = {}
itemIDs = {}
interactions = []

for d in parse(dataDir + "goodreads_reviews_comics_graphic.json.gz"):
    u = d['user_id']
    i = d['book_id']
    r = d['rating']
    if u not in userIDs: userIDs[u] = len(userIDs)
    if i not in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,r))

random.shuffle(interactions)
len(interactions)

# Split into train (90%) and test (10%) sets.
nTrain = int(len(interactions) * 0.9)
nTest = len(interactions) - nTrain
interactionsTrain = interactions[:nTrain]
interactionsTest = interactions[nTrain:]
# Per-user and per-item interaction lists (training split only).
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i,r in interactionsTrain:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)

# Mean training rating, used only to initialize the offset term.
mu = sum(r for _,_,r in interactionsTrain) / len(interactionsTrain)

# Gradient descent optimizer; the learning rate could be experimented with.
optimizer = tf.keras.optimizers.Adam(0.1)


class LatentFactorModel(tf.keras.Model):
    """Latent factor model: alpha + betaU[u] + betaI[i] + gammaU[u].gammaI[i]."""

    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Global offset starts at the mean rating; biases and K-dimensional
        # factors start at small random values.
        self.alpha = tf.Variable(mu)
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs), K], stddev=0.001))
        self.lamb = lamb  # regularization coefficient

    def predict(self, u, i):
        """Predicted rating for a single (user, item) pair of integer IDs."""
        interaction = tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return self.alpha + self.betaU[u] + self.betaI[i] + interaction

    def reg(self):
        """L2 regularizer over all bias and factor parameters."""
        return self.lamb * (tf.reduce_sum(self.betaU**2) +
                            tf.reduce_sum(self.betaI**2) +
                            tf.reduce_sum(self.gammaU**2) +
                            tf.reduce_sum(self.gammaI**2))

    def predictSample(self, sampleU, sampleI):
        """Vectorized predictions for parallel lists of user/item IDs."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        interaction = tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return self.alpha + beta_u + beta_i + interaction

    def call(self, sampleU, sampleI, sampleR):
        """Loss over a sample (tf.nn.l2_loss is 0.5 * sum of squares)."""
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)


# Initialize the model; could experiment with the number of factors and the
# regularization rate.
modelLFM = LatentFactorModel(mu, 5, 0.00001)


def trainingStep(model, interactions):
    """One batch of gradient descent (batch-based model from Chapter 5).

    Draws 50,000 interactions with replacement, takes one optimizer step,
    and returns the batch loss as a numpy scalar.
    """
    n_samples = 50000
    with tf.GradientTape() as tape:
        batch = [random.choice(interactions) for _ in range(n_samples)]
        sampleU = [userIDs[u] for u,_,_ in batch]
        sampleI = [itemIDs[i] for _,i,_ in batch]
        sampleR = [r for _,_,r in batch]

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()
# Run 100 iterations (really 100 batches) of gradient descent.
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))

# Prediction for a particular (held-out) user/item pair.
u,i,r = interactionsTest[0]

modelLFM.predict(userIDs[u], itemIDs[i]).numpy()


# --- Bayesian personalized ranking (Tensorflow) ---

items = list(itemIDs.keys())


class BPRbatch(tf.keras.Model):
    """Batch-based BPR (Chapter 5): ranks items by betaI[i] + gammaU[u].gammaI[i]."""

    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Item biases and K-dimensional user/item factors, small random init.
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs), K], stddev=0.001))
        self.lamb = lamb  # regularization coefficient

    def predict(self, u, i):
        """Unnormalized compatibility score for one (user, item) pair."""
        return self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)

    def reg(self):
        """L2 regularizer over item biases and both factor matrices."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +
                            tf.nn.l2_loss(self.gammaU) +
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Vectorized scores x_ui for parallel lists of user/item IDs."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        return beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)

    def call(self, sampleU, sampleI, sampleJ):
        """BPR loss: -mean log sigmoid(x_ui - x_uj) over (positive i, negative j)."""
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))


optimizer = tf.keras.optimizers.Adam(0.1)

modelBPR = BPRbatch(5, 0.00001)


def trainingStepBPR(model, interactions):
    """One BPR batch: 50,000 (user, positive item, negative item) triples."""
    n_samples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(n_samples):
            u,i,_ = random.choice(interactions)  # positive sample
            j = random.choice(items)             # negative sample:
            while j in itemsPerUser[u]:          # re-draw until unseen by u
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


# Run 100 batches of gradient descent (note: this demo trains on the full
# interaction list, not just the training split).
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))
# Score for a particular user/item pair. Note that this is an unnormalized
# score (usable for ranking), rather than a prediction of a rating.
u,i,_ = interactionsTest[0]

modelBPR.predict(userIDs[u], itemIDs[i]).numpy()


# --- Exercises ---

# 5.1: adapt the latent factor model above, simply deleting any terms
# associated with latent factors.

class LatentFactorModelBiasOnly(tf.keras.Model):
    """Bias-only rating model: alpha + betaU[u] + betaI[i] (no latent factors)."""

    def __init__(self, mu, lamb):
        super(LatentFactorModelBiasOnly, self).__init__()
        # Offset starts at the mean rating; biases start near zero.
        self.alpha = tf.Variable(mu)
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)], stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))
        self.lamb = lamb  # regularization coefficient

    def predict(self, u, i):
        """Predicted rating for a single (user, item) pair of integer IDs."""
        return self.alpha + self.betaU[u] + self.betaI[i]

    def reg(self):
        """L2 regularizer over both bias vectors."""
        return self.lamb * (tf.reduce_sum(self.betaU**2) +
                            tf.reduce_sum(self.betaI**2))

    def predictSample(self, sampleU, sampleI):
        """Vectorized predictions for parallel lists of user/item IDs."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        return self.alpha + beta_u + beta_i

    def call(self, sampleU, sampleI, sampleR):
        """Loss over a sample (tf.nn.l2_loss is 0.5 * sum of squares)."""
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)


modelBiasOnly = LatentFactorModelBiasOnly(mu, 0.00001)


def trainingStepBiasOnly(model, interactions):
    """One batch of gradient descent for the bias-only model."""
    n_samples = 50000
    with tf.GradientTape() as tape:
        batch = [random.choice(interactions) for _ in range(n_samples)]
        sampleU = [userIDs[u] for u,_,_ in batch]
        sampleI = [itemIDs[i] for _,i,_ in batch]
        sampleR = [r for _,_,r in batch]

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()


for i in range(50):
    obj = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))
# Compute the MSEs for a model which always predicts the mean, versus one
# which involves bias terms.

def MSE(predictions, labels):
    """Mean squared error between parallel sequences of predictions and labels."""
    sq_errors = [(x - y)**2 for x, y in zip(predictions, labels)]
    return sum(sq_errors) / len(sq_errors)

# Trivial baseline: always predict the training mean.
alwaysPredictMean = [mu for _ in interactionsTest]
labels = [r for _,_,r in interactionsTest]

MSE(alwaysPredictMean, labels)

biasOnlyPredictions = [modelBiasOnly.predict(userIDs[u], itemIDs[i]).numpy()
                       for u,i,_ in interactionsTest]

biasOnlyPredictions[0]

MSE(biasOnlyPredictions, labels)

# 5.2: performance of a complete latent factor model (using the latent factor
# model implementation from the examples above), now with 10 factors.
optimizer = tf.keras.optimizers.Adam(0.1)
modelLFM = LatentFactorModel(mu, 10, 0.00001)
for i in range(50):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration %d, objective = %s" % (i+1, obj))

predictions = [modelLFM.predict(userIDs[u], itemIDs[i]).numpy()
               for u,i,_ in interactionsTest]

MSE(predictions, labels)

# (probably needs a little more tuning in terms of number of latent factors,
# learning rate, etc.)

# 5.3: experiment with rounding the predictions to the nearest integer.
predictionsRounded = [int(p + 0.5) for p in predictions]

MSE(predictionsRounded, labels)

# Rounding seems to result in worse performance. Rough explanation: for a
# variable that is "1" half the time and "2" half the time, always predicting
# 1.5 (moderate error every time) yields a lower MSE than always predicting
# 1 or 2 (large error half the time).
# 5.4: following the BPR code from the examples above, trained on the
# training interactions only, then evaluated with a sampled AUC.
optimizer = tf.keras.optimizers.Adam(0.1)
modelBPR = BPRbatch(10, 0.00001)

for i in range(50):
    obj = trainingStepBPR(modelBPR, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

# Test-set positives per user, and the set of all items seen in the test set.
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u,i,_ in interactionsTest:
    interactionsTestPerUser[u].add(i)
    itemSet.add(i)

def AUCu(u, N): # N samples per user
    """Sampled AUC for user u: fraction of (positive, negative) pairs in
    which the positive item outscores the negative one.

    N is capped at the user's number of test-set positives. Negatives are
    drawn from test items the user has no test interaction with
    (NOTE(review): they may still appear in the user's training set).
    """
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    # random.sample() requires a sequence: sampling directly from a set is
    # deprecated since Python 3.9 and raises TypeError from 3.11 on, so the
    # sets are materialized as lists first.
    positive = random.sample(list(interactionsTestPerUser[u]), N)
    negative = random.sample(list(itemSet.difference(interactionsTestPerUser[u])), N)
    for i,j in zip(positive,negative):
        si = modelBPR.predict(userIDs[u], itemIDs[i]).numpy()
        sj = modelBPR.predict(userIDs[u], itemIDs[j]).numpy()
        if si > sj:
            win += 1
    return win/N

def AUC():
    """Average sampled AUC (10 samples per user) over all test-set users."""
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(u, 10))
    return sum(av) / len(av)

AUC()