##################################################
# Loading data from the web #
##################################################
from urllib.request import urlopen

# Grab the page from the web
f = urlopen("https://www.goodreads.com/book/show/4671.The_Great_Gatsby")
html = str(f.read())

# Extract the (30) review elements.
# NOTE(review): the HTML tag strings used as split markers were destroyed when
# this file was converted (everything between '<' and '>' was stripped). The
# markers below are reconstructed from the Goodreads page structure at the
# time this example was written — confirm against the live markup before use.
reviews = html.split('<div id="bookReviews"')[1].split('<div class="review"')[1:]
import time

# NOTE(review): the lines importing `time` and defining `timeString` were lost
# when this file was converted. `dataset` entries elsewhere in this file carry
# an ISO 'date' key (see the loop below that reads d['date']), so this
# reconstruction reads the first one — confirm against the original notebook.
timeString = dataset[0]['date']

# String --> struct (parse an ISO date into a time.struct_time)
timeStruct = time.strptime(timeString, "%Y-%m-%d")
# Demo: strptime handles arbitrary format strings (result discarded)
time.strptime("21:36:18, 28/5/2019", "%H:%M:%S, %d/%m/%Y")
# Struct --> int (seconds since the epoch, interpreted as local time)
timeInt = time.mktime(timeStruct)
timeInt2 = time.mktime(time.strptime(dataset[99]['date'], "%Y-%m-%d"))
# Epoch integers support plain arithmetic, e.g. differences in seconds
timeDiff = timeInt - timeInt2
# Int --> struct (gmtime converts back, in UTC)
timeStruct2 = time.gmtime(timeInt2)
# Struct --> string (format a struct back into human-readable text)
time.strftime("%b %Y, %I:%M:%S", timeStruct2)
##################################################
# Matplotlib #
##################################################
datasetWithTimeValues = []
# Augment each record (in place) with two parsed time representations:
#   'timeStruct' — time.struct_time parsed from the ISO 'date' string
#   'timeInt'    — seconds since the epoch (local time), for arithmetic
# (The original had a bare `d['date']` expression here — a leftover notebook
# echo with no effect in a script — which has been removed.)
for d in dataset:
    d['timeStruct'] = time.strptime(d['date'], "%Y-%m-%d")
    d['timeInt'] = time.mktime(d['timeStruct'])
    datasetWithTimeValues.append(d)
# Compile ratings per weekday (tm_wday: 0 = Monday ... 6 = Sunday)
from collections import defaultdict
weekRatings = defaultdict(list)
for d in datasetWithTimeValues:
    weekRatings[d['timeStruct'].tm_wday].append(d['stars'])
# Average the star ratings for each weekday observed in the data
weekAverages = {day: sum(stars) / len(stars) for day, stars in weekRatings.items()}
# Plot...
X = [0, 1, 2, 3, 4, 5, 6]
Y = [weekAverages[x] for x in X]
import matplotlib.pyplot as plt
# Line plot
plt.plot(X, Y)
plt.show()
# Bar plot
plt.bar(X, Y)
plt.show()
# Limits (zoom the y-axis to make the small differences visible)
plt.bar(X, Y)
plt.ylim(3.6, 3.8)
plt.show()
# Labels, ticks, and title.
# time.struct_time.tm_wday runs Monday=0 through Sunday=6, so the tick
# labels must start at Monday. (The original "SMTWTFS" started at Sunday,
# mislabeling every bar by one day.)
plt.ylim(3.6, 3.8)
plt.xlabel("Weekday")
plt.ylabel("Av. Rating")
plt.xticks(X, "MTWTFSS")
plt.title("weekday vs. rating")
plt.bar(X, Y)
plt.show()
##################################################
# Tensorflow #
##################################################
import tensorflow as tf
# PM2.5 data: https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data
path = "datasets/PRSA_data_2010.1.1-2014.12.31.csv"
# Read the CSV: first line is the header, remaining lines are data rows.
# Fixes vs. the original: the file is closed via a with-statement, and each
# line is strip()ed so the last field no longer carries a trailing '\n'.
dataset = []
with open(path, 'r') as f:
    header = f.readline().strip().split(',')
    for line in f:
        dataset.append(line.strip().split(','))
# Exclude rows whose PM2.5 reading (column 5) is missing
dataset = [d for d in dataset if d[5] != 'NA']
# Extract features
def feature(datum):
    """Feature vector for one CSV row: a constant bias term followed by
    temperature (col 7), pressure (col 8), and wind speed (col 10)."""
    temperature = float(datum[7])
    pressure = float(datum[8])
    wind_speed = float(datum[10])
    return [1, temperature, pressure, wind_speed]
# Build the design matrix (one feature vector per row) and the label list
X = [feature(row) for row in dataset]
labels = [float(row[5]) for row in dataset]
# Labels as a tensorflow constant, shaped as a column vector
y = tf.constant(labels, shape=[len(labels), 1])
# Number of features per example (including the bias term)
K = len(X[0])
# Regularized MSE
def MSE(X, y, theta, lamb=1.0):
    """Mean squared error of X @ theta against y, plus an L2 (ridge)
    penalty lamb * sum(theta**2) on the parameters.

    The regularization strength, previously hard-coded to 1.0, is now the
    keyword `lamb`; its default of 1.0 preserves the original behavior.
    """
    return tf.reduce_mean((tf.matmul(X, theta) - y)**2) + lamb * tf.reduce_sum(theta**2)
# Model parameters: K regression weights as a column vector, initialized to 0.
theta = tf.Variable(tf.constant([0.0]*K, shape=[K,1]))
# NOTE(review): this is TensorFlow 1.x graph/session style (tf.train,
# tf.Session, global_variables_initializer) — it will not run unmodified
# on TF 2.x without the compat.v1 shims.
optimizer = tf.train.AdamOptimizer(0.01)
# Build the loss node and the training op before creating the session.
objective = MSE(X,y,theta)
train = optimizer.minimize(objective)
# Initialize variables
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# Run gradient descent: each sess.run() performs one optimizer step and
# fetches the current loss value (cvalues[1]).
for iteration in range(1000):
    cvalues = sess.run([train, objective])
    print("objective = " + str(cvalues[1]))
# Print results: evaluate the final loss and learned parameters in the
# session's context.
with sess.as_default():
    print(MSE(X, y, theta).eval())
    print(theta.eval())