{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import array\n",
    "import gzip\n",
    "import random\n",
    "from tensorflow.keras import Model\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse(path):\n",
    "    g = gzip.open(path, 'r')\n",
    "    for l in g:\n",
    "        yield eval(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "userIDs = {}\n",
    "itemIDs = {}\n",
    "interactions = []\n",
    "\n",
    "# from https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home\n",
    "for d in parse(\"goodreads_reviews_comics_graphic.json.gz\"):\n",
    "    u = d['user_id']\n",
    "    i = d['book_id']\n",
    "    r = d['rating'] # not used\n",
    "    if not u in userIDs: userIDs[u] = len(userIDs)\n",
    "    if not i in itemIDs: itemIDs[i] = len(itemIDs)\n",
    "    interactions.append((u,i,r))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "542338"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(interactions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each user to a set of items, used for sampling negatives\n",
    "itemsPerUser = defaultdict(set)\n",
    "for u,i,_ in interactions:\n",
    "    itemsPerUser[u].add(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "items = list(itemIDs.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = tf.keras.optimizers.Adam(0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BPRbatch(tf.keras.Model):\n",
    "    def __init__(self, K, lamb):\n",
    "        super(BPRbatch, self).__init__()\n",
    "        # Initialize variables\n",
    "        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))\n",
    "        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))\n",
    "        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))\n",
    "        # Regularization coefficient\n",
    "        self.lamb = lamb\n",
    "\n",
    "    # Prediction for a single instance\n",
    "    def predict(self, u, i):\n",
    "        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)\n",
    "        return p\n",
    "\n",
    "    # Regularizer\n",
    "    def reg(self):\n",
    "        return self.lamb * tf.nn.l2_loss(self.betaI) +\\\n",
    "                           tf.nn.l2_loss(self.gammaU) +\\\n",
    "                           tf.nn.l2_loss(self.gammaI)\n",
    "\n",
    "    # Loss for a batch of instances\n",
    "    def call(self, sampleU, sampleI, sampleJ):\n",
    "        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)\n",
    "        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)\n",
    "        j = tf.convert_to_tensor(sampleJ, dtype=tf.int32)\n",
    "        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)\n",
    "        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)\n",
    "        gamma_j = tf.nn.embedding_lookup(self.gammaI, j)\n",
    "        beta_i = tf.nn.embedding_lookup(self.betaI, i)\n",
    "        beta_j = tf.nn.embedding_lookup(self.betaI, j)\n",
    "        x_ui = tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1) + beta_i\n",
    "        x_uj = tf.reduce_sum(tf.multiply(gamma_u, gamma_j), 1) + beta_j\n",
    "        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = BPRbatch(10, 0.0000001)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "batchSize = 10000\n",
    "\n",
    "def trainingStep(interactions):\n",
    "    with tf.GradientTape() as tape:\n",
    "        # Generate lists of user, item, negative item triples\n",
    "        sampleU, sampleI, sampleJ = [], [], []\n",
    "        for _ in range(batchSize):\n",
    "            u,i,_ = random.choice(interactions) # positive sample\n",
    "            j = random.choice(items) # negative sample\n",
    "            while j in itemsPerUser[u]:\n",
    "                j = random.choice(items)\n",
    "            sampleU.append(userIDs[u])\n",
    "            sampleI.append(itemIDs[i])\n",
    "            sampleJ.append(itemIDs[j])\n",
    "\n",
    "        loss = model(sampleU,sampleI,sampleJ)\n",
    "        loss += model.reg()\n",
    "    gradients = tape.gradient(loss, model.trainable_variables)\n",
    "    optimizer.apply_gradients((grad, var) for\n",
    "                              (grad, var) in zip(gradients, model.trainable_variables)\n",
    "                              if grad is not None)\n",
    "    return loss.numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "iteration 0, objective for batch = 0.6931441\n",
      "iteration 1, objective for batch = 0.6894025\n",
      "iteration 2, objective for batch = 0.6839422\n",
      "iteration 3, objective for batch = 0.67746615\n",
      "iteration 4, objective for batch = 0.67004853\n",
      "iteration 5, objective for batch = 0.66284996\n",
      "iteration 6, objective for batch = 0.65489775\n",
      "iteration 7, objective for batch = 0.64770955\n",
      "iteration 8, objective for batch = 0.6375079\n",
      "iteration 9, objective for batch = 0.63136154\n",
      "iteration 10, objective for batch = 0.6243593\n",
      "iteration 11, objective for batch = 0.61448044\n",
      "iteration 12, objective for batch = 0.6062905\n",
      "iteration 13, objective for batch = 0.5976286\n",
      "iteration 14, objective for batch = 0.5919805\n",
      "iteration 15, objective for batch = 0.58349234\n",
      "iteration 16, objective for batch = 0.57864463\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-36-a5c67a32b8ad>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrainingStep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minteractions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"iteration \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\", objective for batch = \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-35-b61470df330c>\u001b[0m in \u001b[0;36mtrainingStep\u001b[0;34m(interactions)\u001b[0m\n\u001b[1;32m     11\u001b[0m                 \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchoice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m             \u001b[0msampleU\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muserIDs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mu\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m             \u001b[0msampleI\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitemIDs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     14\u001b[0m             \u001b[0msampleJ\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitemIDs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "losses = []\n",
    "iterations = 100\n",
    "\n",
    "for i in range(iterations):\n",
    "    obj = trainingStep(interactions)\n",
    "    print(\"iteration \" + str(i) + \", objective for batch = \" + str(obj))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'21855709'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "items[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.00269207, -0.00853635, -0.00305801,  0.00092033, -0.00108074,\n",
       "       -0.00109601, -0.00532799,  0.00042907, -0.0001782 ,  0.00162639],\n",
       "      dtype=float32)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.gammaI[itemIDs[items[0]]].numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
