import gzip from collections import defaultdict from random import random def readGz(f): for l in gzip.open(f): yield eval(l) ### Helpfulness baseline: similar to the above. Compute the global average helpfulness rate, and the average helpfulness rate for each user nUser = defaultdict(int) nItem = defaultdict(int) ratings = [] for l in readGz("train.json.gz"): ratings.append((l['reviewerID'], l['itemID'], l['rating'])) nUser[l['reviewerID']] += 1 nItem[l['itemID']] += 1 nTrain = len(ratings) lamb = 1.0 alpha = 0 userBeta = defaultdict(float) itemBeta = defaultdict(float) objective = None while (True): #update alpha alpha = 0 for user, item, rating in ratings: alpha += (rating - (userBeta[user] + itemBeta[item])) / nTrain #update user terms userBeta = defaultdict(float) for user, item, rating in ratings: userBeta[user] += (rating - (alpha + itemBeta[item])) / (lamb + nUser[user]) #update item terms itemBeta = defaultdict(float) for user, item, rating in ratings: itemBeta[item] += (rating - (alpha + userBeta[user])) / (lamb + nItem[item]) oldObjective = objective objective = 0 for user, item, rating in ratings: objective += (rating - (alpha + userBeta[user] + itemBeta[item]))**2 for u in userBeta: objective += lamb * userBeta[u]**2 for i in itemBeta: objective += lamb * itemBeta[i]**2 print objective if (oldObjective and oldObjective - objective < 0.0000001*objective): break ### Gradient descent alpha = 0 for user in userBeta: userBeta[user] = 0 for item in itemBeta: itemBeta[item] = 0 objective = None stepsize = 0.000001 while (True): dalpha = 0 duserBeta = defaultdict(float) ditemBeta = defaultdict(float) for user, item, rating in ratings: residual = alpha + userBeta[user] + itemBeta[item] - rating dalpha += 2*residual duserBeta[user] += 2*residual ditemBeta[item] += 2*residual for user in userBeta: duserBeta[user] += 2*lamb*userBeta[user] for item in itemBeta: ditemBeta[item] += 2*lamb*itemBeta[item] alpha -= stepsize*dalpha for user in userBeta: userBeta[user] -= stepsize*duserBeta[user] for item in itemBeta: itemBeta[item] -= stepsize*ditemBeta[item] oldObjective = objective objective = 0 for user, item, rating in ratings: objective += (rating - (alpha + userBeta[user] + itemBeta[item]))**2 for u in userBeta: objective += lamb * userBeta[u]**2 for i in itemBeta: objective += lamb * itemBeta[i]**2 print objective if (oldObjective and oldObjective - objective < 0.0000001*objective): break