Similarity-based recommendation

In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random
In [6]:
# From https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz
path = "C://Users/Julian McAuley/Documents/class_files/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
In [7]:
f = gzip.open(path, 'rt', encoding="utf8")
In [8]:
header = f.readline()
header = header.strip().split('\t')
In [9]:
dataset = []
In [10]:
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)
In [11]:
dataset[0]
Out[11]:
{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

First we'll build a few useful data structures, in this case just to maintain a collection of the items reviewed by each user, and the collection of users who have reviewed each item.

In [12]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
In [13]:
itemNames = {}
In [14]:
for d in dataset:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    itemNames[item] = d['product_title']
In [15]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom
In [16]:
def mostSimilar(i):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:10]

Generating a recommendation

In [17]:
dataset[2]
Out[17]:
{'marketplace': 'US',
 'customer_id': '6111003',
 'review_id': 'RIZR67JKUDBI0',
 'product_id': 'B0006VMBHI',
 'product_parent': '603261968',
 'product_title': 'AudioQuest LP record clean brush',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Three Stars',
 'review_body': 'removes dust. does not clean',
 'review_date': '2015-08-31'}
In [18]:
query = dataset[2]['product_id']
In [19]:
mostSimilar(query)
Out[19]:
[(0.028446389496717725, 'B00006I5SD'),
 (0.01694915254237288, 'B00006I5SB'),
 (0.015065913370998116, 'B000AJR482'),
 (0.014204545454545454, 'B00E7MVP3S'),
 (0.008955223880597015, 'B001255YL2'),
 (0.008849557522123894, 'B003EIRVO8'),
 (0.008333333333333333, 'B0015VEZ22'),
 (0.00821917808219178, 'B00006I5UH'),
 (0.008021390374331552, 'B00008BWM7'),
 (0.007656967840735069, 'B000H2BC4E')]
In [20]:
itemNames[query]
Out[20]:
'AudioQuest LP record clean brush'
In [21]:
[itemNames[x[1]] for x in mostSimilar(query)]
Out[21]:
['Shure SFG-2 Stylus Tracking Force Gauge',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'ART Pro Audio DJPRE II Phono Turntable Preamplifier',
 'Signstek Blue LCD Backlight Digital Long-Playing LP Turntable Stylus Force Scale Gauge Tester',
 'Audio Technica AT120E/T Standard Mount Phono Cartridge',
 'Technics: 45 Adaptor for Technics 1200 (SFWE010)',
 'GruvGlide GRUVGLIDE DJ Package',
 'STANTON MAGNETICS Record Cleaner Kit',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'Behringer PP400 Ultra Compact Phono Preamplifier']

Efficient similarity-based recommendation

In [22]:
def mostSimilarFast(i):
    similarities = []
    users = usersPerItem[i]
    candidateItems = set()
    for u in users:
        candidateItems = candidateItems.union(itemsPerUser[u])
    for i2 in candidateItems:
        if i2 == i: continue
        sim = Jaccard(users, usersPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:10]
In [23]:
mostSimilarFast(query)
Out[23]:
[(0.028446389496717725, 'B00006I5SD'),
 (0.01694915254237288, 'B00006I5SB'),
 (0.015065913370998116, 'B000AJR482'),
 (0.014204545454545454, 'B00E7MVP3S'),
 (0.008955223880597015, 'B001255YL2'),
 (0.008849557522123894, 'B003EIRVO8'),
 (0.008333333333333333, 'B0015VEZ22'),
 (0.00821917808219178, 'B00006I5UH'),
 (0.008021390374331552, 'B00008BWM7'),
 (0.007656967840735069, 'B000H2BC4E')]

Collaborative-filtering-based rating estimation

In [20]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
In [21]:
for d in dataset:
    user,item = d['customer_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
In [22]:
ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)
In [23]:
ratingMean
Out[23]:
4.251102772543146

Our prediction function computes (a) a list of the user's previous ratings (ignoring the query item); and (b) a list of the similarities of these previous items, compared to the query. These weights are used to constructed a weighted average of the ratings from the first set.

In [24]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean

Let's try a simple example:

In [25]:
dataset[1]
Out[25]:
{'customer_id': '14640079',
 'helpful_votes': 0,
 'marketplace': 'US',
 'product_category': 'Musical Instruments',
 'product_id': 'B003LRN53I',
 'product_parent': '986692292',
 'product_title': 'Sennheiser HD203 Closed-Back DJ Headphones',
 'review_body': 'Nice headphones at a reasonable price.',
 'review_date': '2015-08-31',
 'review_headline': 'Five Stars',
 'review_id': 'RZSL0BALIYUNU',
 'star_rating': 5,
 'total_votes': 0,
 'verified_purchase': 'Y',
 'vine': 'N'}
In [26]:
u,i = dataset[1]['customer_id'], dataset[1]['product_id']
In [27]:
predictRating(u, i)
Out[27]:
5.0
In [28]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)
In [29]:
alwaysPredictMean = [ratingMean for d in dataset]
In [30]:
cfPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataset]
In [31]:
labels = [d['star_rating'] for d in dataset]
In [32]:
MSE(alwaysPredictMean, labels)
Out[32]:
1.4796142779564334
In [33]:
MSE(cfPredictions, labels)
Out[33]:
1.6146130004291603

In this case, the accuracy of our rating prediction model was actually worse (in terms of the MSE) than just predicting the mean rating. However note again that this is just a heuristic, and could be modified to improve its predictions (e.g. by using a different similarity function other than the Jaccard similarity).