import numpy
import urllib
import scipy.optimize
import random
from sklearn import svm

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
data = list(parseData("http://jmcauley.ucsd.edu/cse255/data/amazon/book_descriptions_50000.json"))
print "done"

### Naive bayes to determine p(childrens book | mentions wizards and mentions witches) ###

# p(childrens book)
prior = ["Children's Books" in b['categories'] for b in data]
prior = sum(prior) * 1.0 / len(prior)

# p(isn't children's book)
prior_neg = 1 - prior

# p(mentions wizards | is childrens)
p1 = ['wizard' in b['description'] for b in data if "Children's Books" in b['categories']]
p1 = sum(p1) * 1.0 / len(p1)

# p(mentions wizards | isn't childrens)
p1_neg = ['wizard' in b['description'] for b in data if not ("Children's Books" in b['categories'])]
p1_neg = sum(p1_neg) * 1.0 / len(p1_neg)

# p(mentions witches | is childrens)
p2 = ['witch' in b['description'] for b in data if "Children's Books" in b['categories']]
p2 = sum(p2) * 1.0 / len(p2)

# p(mentions witches | isn't childrens)
p2_neg = ['witch' in b['description'] for b in data if not ("Children's Books" in b['categories'])]
p2_neg = sum(p2_neg) * 1.0 / len(p2_neg)

# Prediction

score = prior * p1 * p2
score_neg = prior_neg * p1_neg * p2_neg

# Actual ('non-naive') probability

p = ["Children's Books" in b['categories'] for b in data if 'witch' in b['description'] and 'wizard' in b['description']]
p = sum(p) * 1.0 / len(p)

### SVM -- "Judging a book by its cover"

print "Reading data..."
data = list(parseData("http://jmcauley.ucsd.edu/cse255/data/amazon/book_images_5000.json"))
print "done"

X = [b['image_feature'] for b in data]
y = ["Children's Books" in b['categories'] for b in data]

X_train = X[:2500]
y_train = y[:2500]

X_test = X[2500:]
y_test = y[2500:]

# Create a support vector classifier object, with regularization parameter C = 1000
clf = svm.SVC(C=1000)
clf.fit(X_train, y_train)

train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)