{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import numpy\n",
    "import random\n",
    "import scipy\n",
    "import scipy.sparse\n",
    "from fastFM import als\n",
    "from scipy import spatial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import yaml\n",
    "import demjson\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "import numpy\n",
    "import math\n",
    "import dateutil.parser\n",
    "from collections import defaultdict\n",
    "from scipy.stats import probplot\n",
    "import sklearn\n",
    "from sklearn import linear_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.switch_backend('PS') \n",
    "mpl.use('PS')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "mpl.rcParams['text.usetex']=True\n",
    "mpl.rcParams['text.latex.unicode']=True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parseData(fname):\n",
    "  for l in gzip.open(fname):\n",
    "    yield eval(l)\n",
    "\n",
    "def parseDataFromURL(fname):\n",
    "  for l in urlopen(fname):\n",
    "    yield eval(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = list(parseData(\"goodreads_reviews_fantasy_paranormal.json.gz\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.shuffle(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "userIDs,itemIDs = {},{}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for d in data:\n",
    "    u,i = d['user_id'],d['book_id']\n",
    "    if not u in userIDs: userIDs[u] = len(userIDs)\n",
    "    if not i in itemIDs: itemIDs[i] = len(itemIDs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nUsers,nItems = len(userIDs),len(itemIDs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nUsers,nItems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = scipy.sparse.lil_matrix((len(data), nUsers + nItems))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(data)):\n",
    "    user = userIDs[data[i]['user_id']]\n",
    "    item = itemIDs[data[i]['book_id']]\n",
    "    X[i,user] = 1\n",
    "    X[i,nUsers + item] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = scipy.array([d['rating'] for d in data])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train,y_train = X[:2000000],y[:2000000]\n",
    "X_test,y_test = X[2000000:],y[2000000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fm.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = fm.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(als.FMRegression)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = scipy.sparse.lil_matrix((nItems, nUsers))\n",
    "for d in data:\n",
    "    X[itemIDs[d['book_id']],userIDs[d['user_id']]] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = scipy.sparse.coo_matrix((data, (items, users)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from implicit import bpr\n",
    "from implicit import als"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = implicit.als.AlternatingLeastSquares(factors=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(X2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = bpr.BayesianPersonalizedRanking(factors = 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "usersToItems = scipy.sparse.csr_matrix(X.T)\n",
    "\n",
    "recommended = model.recommend(0, usersToItems)\n",
    "related = model.similar_items(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(model.similar_items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "itemFactors = model.item_factors\n",
    "userFactors = model.user_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "related"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import spatial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "1 - spatial.distance.cosine(itemFactors[0], itemFactors[1277])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nItems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import surprise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from surprise import SVD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from surprise import Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = Dataset.load_builtin('ml-100k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open(\"goodreads_fantasy.tsv\", 'w')\n",
    "for d in data:\n",
    "    f.write(str(d['user_id']) + '\\t' + str(d['book_id']) + '\\t' + str(d['rating']) + '\\n')\n",
    "\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import surprise\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from surprise import SVD, Reader, Dataset\n",
    "from surprise.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reader = Reader(line_format='user item rating', sep='\\t')\n",
    "data = Dataset.load_from_file(\"goodreads_fantasy.tsv\", reader=reader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SVD()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainset, testset = train_test_split(data, test_size=.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(trainset)\n",
    "predictions = model.test(testset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions[0].est"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sse = 0\n",
    "for p in predictions:\n",
    "    sse += (p.r_ui - p.est)**2\n",
    "\n",
    "print(sse / len(predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from implicit import bpr\n",
    "from implicit import als"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data = []\n",
    "for x in parseData(\"/home/jmcauley/PML book data/goodreads_reviews_comics_graphic.json.gz\"):\n",
    "    del x['review_text']\n",
    "    data.append(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "random.shuffle(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "userIDs,itemIDs = {},{}\n",
    "revUIDs,revIIDs = {},{}\n",
    "for d in data:\n",
    "    u,i = d['user_id'],d['book_id']\n",
    "    if not u in userIDs:\n",
    "        userIDs[u] = len(userIDs)\n",
    "        revUIDs[userIDs[u]] = u\n",
    "    if not i in itemIDs:\n",
    "        itemIDs[i] = len(itemIDs)\n",
    "        revIIDs[itemIDs[i]] = i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "nUsers,nItems = len(userIDs),len(itemIDs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "Xiu = scipy.sparse.lil_matrix((len(data), nUsers + nItems))\n",
    "for d in data:\n",
    "    Xiu[itemIDs[d['book_id']],userIDs[d['user_id']]] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "Xui = scipy.sparse.csr_matrix(X.T)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = bpr.BayesianPersonalizedRanking(factors = 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fae3ce2e9dcb41d18d3deee46a75bf0a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "model.fit(Xiu)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(2016, 2.7942648),\n",
       " (1450, 2.771629),\n",
       " (5396, 2.726222),\n",
       " (7442, 2.5824156),\n",
       " (170, 2.577093),\n",
       " (262, 2.4888153),\n",
       " (5294, 2.4322572),\n",
       " (891, 2.4138844),\n",
       " (6797, 2.400085),\n",
       " (12135, 2.379011)]"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.recommend(0, Xui)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "interactionTuples = []\n",
    "itemsPerUser = defaultdict(list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "for d in data:\n",
    "    u,i = d['user_id'],d['book_id']\n",
    "    interactionTuples.append((userIDs[u],itemIDs[i]))\n",
    "    itemsPerUser[userIDs[u]].append(itemIDs[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "recommendationTuples = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "for u in range(len(userIDs)):\n",
    "    A = model.recommend(u, Xui, N = len(itemsPerUser[u]))\n",
    "    for i, sc in A:\n",
    "        recommendationTuples.append((u,i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "countsPerItemI = defaultdict(int)\n",
    "countsPerItemR = defaultdict(int)\n",
    "for u,i in interactionTuples:\n",
    "    countsPerItemI[i] += 1\n",
    "\n",
    "for u,i in recommendationTuples:\n",
    "    countsPerItemR[i] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "sortPopularI = [(countsPerItemI[i], i) for i in countsPerItemI]\n",
    "sortPopularR = [(countsPerItemR[i], i) for i in countsPerItemR]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "sortPopularI.sort(reverse=True)\n",
    "sortPopularR.sort(reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "X2 = range(1,301)\n",
    "YI = [x[0] for x in sortPopularI[:300]]\n",
    "Ys = [x[1] for x in sortPopularI[:300]]\n",
    "YR = [countsPerItemR[x] for x in Ys]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(2.5,2))\n",
    "ax = plt.subplot(111)\n",
    "ax.set_position([0.225,0.2,0.7,0.7])\n",
    "plt.xlim(1,200)\n",
    "plt.ylim(0,3150)\n",
    "plt.plot(X2,YR, color='grey', label = \"recommendations\")\n",
    "plt.plot(X2,YI, color='k', label = \"interactions\")\n",
    "plt.xlabel(\"popularity rank (interactions)\")\n",
    "plt.ylabel(\"interaction freq.\")\n",
    "plt.legend(loc=\"best\",  prop={'size': 8})\n",
    "#plt.yticks([2,4,6,8])\n",
    "plt.title(\"Recommendation diversity\")\n",
    "plt.savefig(\"diversityDist.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "X2 = range(1,301)\n",
    "YR = [x[0] for x in sortPopularR[:300]]\n",
    "Ys = [x[1] for x in sortPopularR[:300]]\n",
    "YI = [countsPerItemI[x] for x in Ys]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(2.5,2))\n",
    "ax = plt.subplot(111)\n",
    "ax.set_position([0.225,0.2,0.7,0.7])\n",
    "plt.plot(X2,YI, color='k', label = \"interactions\")\n",
    "plt.xlim(1,200)\n",
    "plt.ylim(0,3150)\n",
    "plt.plot(X2,YR, color='grey', label = \"recommendations\")\n",
    "plt.xlabel(\"popularity rank (recommendations)\")\n",
    "plt.ylabel(\"interaction freq.\")\n",
    "plt.legend(loc=\"best\",  prop={'size': 8})\n",
    "#plt.yticks([2,4,6,8])\n",
    "plt.title(\"Recommendation diversity\")\n",
    "plt.savefig(\"diversityDist2.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gini(z, samples=1000000):\n",
    "    m = sum(z) / len(z)\n",
    "    denom = 2 * samples * m\n",
    "    numer = 0\n",
    "    #for i in range(len(z)):\n",
    "#        for j in range(len(z)):\n",
    "    for _ in range(samples):\n",
    "        i = random.choice(z)\n",
    "        j = random.choice(z)\n",
    "        numer += math.fabs(i - j)\n",
    "    return numer / denom\n",
    "\n",
    "def giniExact(z):\n",
    "    m = sum(z) / len(z)\n",
    "    denom = 2 * len(z)**2 * m\n",
    "    numer = 0\n",
    "    for i in range(len(z)):\n",
    "        for j in range(len(z)):\n",
    "            numer += math.fabs(z[i] - z[j])\n",
    "    return numer / denom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7176372148964298"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gini([x[0] for x in sortPopularI])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7738477359626285"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gini([x[0] for x in sortPopularR])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.06782733],\n",
       "       [0.06782733, 1.        ]])"
      ]
     },
     "execution_count": 270,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "YR = [x[0] for x in sortPopularR]\n",
    "Ys = [x[1] for x in sortPopularR]\n",
    "YI = [countsPerItemI[x] for x in Ys]\n",
    "\n",
    "numpy.corrcoef(YR,YI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [],
   "source": [
    "def avCosine(z, samples = 100):\n",
    "    av = []\n",
    "    while len(av) < samples:\n",
    "        i = random.choice(z)\n",
    "        j = random.choice(z)\n",
    "        d = 1 - spatial.distance.cosine(i, j)\n",
    "        if not math.isnan(d) and d > 0:\n",
    "            av.append(d)\n",
    "    return sum(av) / len(av)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [],
   "source": [
    "itemFactors = model.item_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/scipy/spatial/distance.py:720: RuntimeWarning: invalid value encountered in float_scalars\n",
      "  dist = 1.0 - uv / np.sqrt(uu * vv)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.455286483662203"
      ]
     },
     "execution_count": 265,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "avCosine([itemFactors[i] for i in itemsPerUser[0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [],
   "source": [
    "avavI = []\n",
    "avavR = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/scipy/spatial/distance.py:720: RuntimeWarning: invalid value encountered in float_scalars\n",
      "  dist = 1.0 - uv / np.sqrt(uu * vv)\n"
     ]
    }
   ],
   "source": [
    "while len(avavI) < 1000:\n",
    "    u = random.choice(range(len(userIDs)))\n",
    "    if len(itemsPerUser[u]) < 10:\n",
    "        continue\n",
    "    aI = avCosine([itemFactors[i] for i in itemsPerUser[u]])\n",
    "    A = model.recommend(u, X, N = len(itemsPerUser[u]))\n",
    "    aR = avCosine([itemFactors[i[0]] for i in A])\n",
    "    avavI.append(aI)\n",
    "    avavR.append(aR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 268,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.45346739776550166"
      ]
     },
     "execution_count": 268,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(avavI) / len(avavI)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 269,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8442348545947543"
      ]
     },
     "execution_count": 269,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum(avavR) / len(avavR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 251,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function cosine in module scipy.spatial.distance:\n",
      "\n",
      "cosine(u, v, w=None)\n",
      "    Compute the Cosine distance between 1-D arrays.\n",
      "    \n",
      "    The Cosine distance between `u` and `v`, is defined as\n",
      "    \n",
      "    .. math::\n",
      "    \n",
      "        1 - \\frac{u \\cdot v}\n",
      "                  {||u||_2 ||v||_2}.\n",
      "    \n",
      "    where :math:`u \\cdot v` is the dot product of :math:`u` and\n",
      "    :math:`v`.\n",
      "    \n",
      "    Parameters\n",
      "    ----------\n",
      "    u : (N,) array_like\n",
      "        Input array.\n",
      "    v : (N,) array_like\n",
      "        Input array.\n",
      "    w : (N,) array_like, optional\n",
      "        The weights for each value in `u` and `v`. Default is None,\n",
      "        which gives each value a weight of 1.0\n",
      "    \n",
      "    Returns\n",
      "    -------\n",
      "    cosine : double\n",
      "        The Cosine distance between vectors `u` and `v`.\n",
      "    \n",
      "    Examples\n",
      "    --------\n",
      "    >>> from scipy.spatial import distance\n",
      "    >>> distance.cosine([1, 0, 0], [0, 1, 0])\n",
      "    1.0\n",
      "    >>> distance.cosine([100, 0, 0], [0, 1, 0])\n",
      "    1.0\n",
      "    >>> distance.cosine([1, 1, 0], [0, 1, 0])\n",
      "    0.29289321881345254\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(spatial.distance.cosine)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open(\"goodreads_comics.tsv\", 'w')\n",
    "for d in data:\n",
    "    f.write(str(d['user_id']) + '\\t' + str(d['book_id']) + '\\t' + str(d['rating']) + '\\n')\n",
    "\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "from surprise import SVD, Reader, Dataset\n",
    "from surprise.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "reader = Reader(line_format='user item rating', sep='\\t')\n",
    "data = Dataset.load_from_file(\"goodreads_fantasy.tsv\", reader=reader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SVD()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainset, testset = train_test_split(data, test_size=0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-143-03e04613f8e0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrainset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpredictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtestset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/surprise/prediction_algorithms/algo_base.py\u001b[0m in \u001b[0;36mtest\u001b[0;34m(self, testset, verbose)\u001b[0m\n\u001b[1;32m    166\u001b[0m                                     \u001b[0mr_ui_trans\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    167\u001b[0m                                     verbose=verbose)\n\u001b[0;32m--> 168\u001b[0;31m                        for (uid, iid, r_ui_trans) in testset]\n\u001b[0m\u001b[1;32m    169\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    170\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/surprise/prediction_algorithms/algo_base.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    166\u001b[0m                                     \u001b[0mr_ui_trans\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    167\u001b[0m                                     verbose=verbose)\n\u001b[0;32m--> 168\u001b[0;31m                        for (uid, iid, r_ui_trans) in testset]\n\u001b[0m\u001b[1;32m    169\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    170\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/local/lib/python3.5/dist-packages/surprise/prediction_algorithms/algo_base.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, uid, iid, r_ui, clip, verbose)\u001b[0m\n\u001b[1;32m    104\u001b[0m         \u001b[0mdetails\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    105\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m             \u001b[0mest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miuid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miiid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    107\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    108\u001b[0m             \u001b[0;31m# If the details dict was also returned\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "model.fit(trainset)\n",
    "predictions = model.test(testset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
