Lecture 4 example: Gradient method for parameter estimation#

  • NB: Even though gradient-based estimation is already built into most machine learning/regression libraries, it is worth understanding how it works under the hood

  • There are two main types of gradient methods: deterministic and stochastic

  • Think about their efficiency and accuracy, i.e., the pros and cons of each

PART I: For this implementation, we are going to use the advertising dataset.#

This dataset records the total sales of different products after they were advertised on television, radio and in newspapers. Using our algorithm, we can find out which medium contributes most to sales and assign weights to the media accordingly.

# load some necessary libraries
import pandas as pd
from numpy.random import randint

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
df=pd.read_csv('Advertising.csv')
df.head()
   Unnamed: 0     TV  radio  newspaper  sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9
# Standardise the target and each feature column (zero mean, unit standard deviation)
X=df[['TV','radio','newspaper']]
Y=df['sales']
Y=np.array((Y-Y.mean())/Y.std())
X=X.apply(lambda rec:(rec-rec.mean())/rec.std(),axis=0)
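
A quick, optional sanity check (a minimal sketch using only the objects defined above) confirms that the standardisation worked: every column of X and the target Y should now have mean roughly 0 and standard deviation roughly 1.

# Sanity check: means should be ~0 and standard deviations ~1 after standardisation
print(X.mean().round(6))
print(X.std().round(6))
print(round(Y.mean(), 6), round(Y.std(), 6))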

Once we have a normalised dataset, we can start defining the algorithm. Implementing gradient descent takes four steps:

  • Randomly initialize the bias b and the weight vector theta

  • Calculate the predicted value of Y given the current bias and weights

  • Calculate the cost function from the predicted and actual values of Y

  • Calculate the gradient and update the bias and weights (the corresponding formulas are written out just below)
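
Written out, the model and cost used in these steps are the standard linear-regression ones: for a record \(\mathbf{x}_i\), the prediction is \(\hat{y}_i = b + \mathbf{x}_i^T\boldsymbol\theta\), and the cost being minimised is the mean squared error \(J(b,\boldsymbol\theta) = \frac{1}{N}\sum_{i=1}^N (y_i - \hat{y}_i)^2\).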

import random
def initialize(dim):
    b=random.random()
    theta=np.random.rand(dim)
    return b,theta

b,theta=initialize(3)
print("Bias: ",b, ",   ", "Weights: ",theta)
Bias:  0.9526887046636117 ,    Weights:  [0.01683829 0.6870954  0.50098462]
def predict_Y(b,theta,X):
    return b + np.dot(X,theta)

Y_hat=predict_Y(b,theta,X)
Y_hat[0:10]
array([ 2.53068341,  2.00931635,  2.8660883 ,  2.43110685,  1.02304367,
        3.13437818,  1.21417681,  0.3418495 , -0.73375777, -0.20844808])
def get_cost(Y,Y_hat):
    # Mean squared error between the actual and predicted values
    Y_resd=Y-Y_hat
    return np.sum(np.dot(Y_resd.T,Y_resd))/len(Y)

Y_hat=predict_Y(b,theta,X)
get_cost(Y,Y_hat)
1.8254478835326722

NB: The parameters passed to the update function are

  • x, y: the input and output variables

  • y_hat: the predicted values of Y under the current bias and weights

  • b_0, theta_0: the current bias and weights

  • learning_rate: the learning rate that scales the update step (the update formulas are written out just below)
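
In terms of formulas (a worked statement of what the function below computes), the gradients of the mean squared error are \(\frac{\partial J}{\partial b} = \frac{2}{N}\sum_{i=1}^N(\hat{y}_i - y_i)\) and \(\frac{\partial J}{\partial \boldsymbol\theta} = \frac{2}{N}\mathbf{X}^T(\hat{\mathbf{y}} - \mathbf{y})\), and the update step is \(b \leftarrow b - \alpha\,\frac{\partial J}{\partial b}\), \(\boldsymbol\theta \leftarrow \boldsymbol\theta - \alpha\,\frac{\partial J}{\partial \boldsymbol\theta}\), where \(\alpha\) is the learning rate.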

def update_theta(x,y,y_hat,b_0,theta_0,learning_rate):
    # Gradients of the mean squared error with respect to the bias and the weights
    db=(np.sum(y_hat-y)*2)/len(y)
    dw=(np.dot((y_hat-y),x)*2)/len(y)
    # Take one gradient descent step
    b_1=b_0-learning_rate*db
    theta_1=theta_0-learning_rate*dw
    return b_1,theta_1

print("After initialization -Bias: ",b,"theta: ",theta)
Y_hat=predict_Y(b,theta,X)
b,theta=update_theta(X,Y,Y_hat,b,theta,0.01)
print("After first update -Bias: ",b,"theta: ",theta)
get_cost(Y,Y_hat)
After initialization -Bias:  0.9526887046636117 theta:  [0.01683829 0.6870954  0.50098462]
After first update -Bias:  0.9336349305703394 theta:  [0.03075531 0.68134039 0.49069747]
1.8254478835326722
def run_gradient_descent(X,Y,alpha,num_iterations):
    b,theta=initialize(X.shape[1])
    iter_num=0
    gd_iterations_df=pd.DataFrame(columns=['iteration','cost'])
    result_idx=0
    for each_iter in range(num_iterations):
        Y_hat=predict_Y(b,theta,X)
        this_cost=get_cost(Y,Y_hat)
        prev_b=b
        prev_theta=theta
        b,theta=update_theta(X,Y,Y_hat,prev_b,prev_theta,alpha)
        # Record the cost every 10 iterations
        if((iter_num%10==0)):
            gd_iterations_df.loc[result_idx]=[iter_num,this_cost]
        result_idx=result_idx+1
        iter_num +=1
    print("Final Estimate of b and theta is: ",b,theta)
    return gd_iterations_df,b,theta

gd_iterations_df,b,theta=run_gradient_descent(X,Y,alpha=0.001,num_iterations=200)
Final Estimate of b and theta is:  0.3364027827437597 [0.46412853 0.2043997  0.13832669]
gd_iterations_df[0:10]
    iteration      cost
0         0.0  0.762186
10       10.0  0.736024
20       20.0  0.710951
30       30.0  0.686920
40       40.0  0.663886
50       50.0  0.641806
60       60.0  0.620638
70       70.0  0.600343
80       80.0  0.580883
90       90.0  0.562223
%matplotlib inline
plt.plot(gd_iterations_df['iteration'],gd_iterations_df['cost'])
plt.xlabel("Number of iterations")
plt.ylabel("Cost or MSE")
Text(0, 0.5, 'Cost or MSE')
[Figure: cost (MSE) vs. number of iterations]
alpha_df_1,b,theta=run_gradient_descent(X,Y,alpha=0.01,num_iterations=2000)
alpha_df_2,b,theta=run_gradient_descent(X,Y,alpha=0.001,num_iterations=2000)
plt.plot(alpha_df_1['iteration'],alpha_df_1['cost'],label="alpha=0.01")
plt.plot(alpha_df_2['iteration'],alpha_df_2['cost'],label="alpha=0.001")
plt.legend()
plt.ylabel('cost')
plt.xlabel('Number of iterations')
plt.title('Cost Vs. Iterations for different alpha values')
Final Estimate of b and theta is:  2.7774630888127755e-16 [ 0.75306591  0.53648155 -0.00433069]
Final Estimate of b and theta is:  0.015495503503709054 [0.74481196 0.50267295 0.03065355]
Text(0.5, 1.0, 'Cost Vs. Iterations for different alpha values')
[Figure: cost vs. iterations for alpha = 0.01 and alpha = 0.001]

PART II: Demonstration of stochastic gradient descent#

A good reference to work on: https://jcboyd.github.io/assets/lsml2018/stochastic_gradient_descent.html

from scipy import io
from numpy import log, exp

data = io.loadmat('data_orsay_2017.mat')

X_train, y_train = data['Xtrain'], data['ytrain']
X_test, y_test = data['Xtest'], data['ytest']

print('X_train shape: %s' % str(X_train.shape))
print('y_train shape: %s' % str(y_train.shape))
print('X_test shape: %s' % str(X_test.shape))
print('y_test shape: %s' % str(y_test.shape))
X_train shape: (10000, 100)
y_train shape: (10000, 1)
X_test shape: (100000, 100)
y_test shape: (100000, 1)
# Prepend a column of ones so that each row becomes [1, x] (the first coefficient then acts as the bias)
X_train_bt = np.concatenate([np.ones((X_train.shape[0], 1)), X_train], axis=1)
X_test_bt = np.concatenate([np.ones((X_test.shape[0], 1)), X_test], axis=1)
X_train_bt.shape, X_test_bt.shape 
((10000, 101), (100000, 101))
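
Prepending a column of ones folds the intercept into the parameter vector: with the augmented input, \(\mathbf{x}^T\boldsymbol\beta = \beta_0 + \sum_{j=1}^{100}\beta_j x_j\), so \(\beta_0\) plays the same role as the bias term \(b\) in Part I and a single dot product covers both bias and weights.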

Define the logistic loss function: \(\mathrm{Loss} = \frac{1}{N}\sum_{i=1}^N\log(1 + \exp(-y_i\mathbf{x}_i^T\boldsymbol\beta))\)

# Define the logistic loss function: $\frac{1}{N}\sum_{i=1}^N\log(1 + \exp(-y_i\mathbf{x}_i^T\boldsymbol\beta))$

def log_loss(X, y, beta):
    # Average logistic loss over all samples
    return np.sum(np.log(1 + np.exp(-y * X.dot(beta)))) / X.shape[0]


# Implement a function to return the logistic loss gradient:
def evaluate_gradient(X, y, beta):
    # Gradient of the logistic loss with respect to beta, averaged over the samples
    return np.sum(-X * y / (1 + np.exp(y * X.dot(beta))), axis=0) / X.shape[0]
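
For reference, the gradient implemented above is \(\nabla_{\boldsymbol\beta}\,\mathrm{Loss} = -\frac{1}{N}\sum_{i=1}^N \frac{y_i\,\mathbf{x}_i}{1 + \exp(y_i\mathbf{x}_i^T\boldsymbol\beta)}\), obtained by differentiating the logistic loss with respect to \(\boldsymbol\beta\).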

NB: Stochastic gradient descent#

Since the number of training samples is 10,000, computing the full gradient at every step becomes costly, so we use stochastic (minibatch) gradient descent instead.
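
At each iteration, stochastic (minibatch) gradient descent evaluates the gradient on a small batch \(B\) of \(M\) samples rather than on all \(N\) of them, and takes the step \(\boldsymbol\beta \leftarrow \boldsymbol\beta - \eta\,\nabla_{\boldsymbol\beta}\mathrm{Loss}_B(\boldsymbol\beta)\), where \(\mathrm{Loss}_B\) is the logistic loss averaged over the batch only. Each update therefore costs \(O(MD)\) instead of \(O(ND)\) operations, at the price of a noisier gradient estimate.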

# Pay attention to how the batch indices are generated: here the batches cycle through the data in order
def minibatch_gradient_descent(X, y, batch_size=10, lr=1e-1, max_iters=1000, tol=1e-5):
    # randomly initialise beta
    N, D = X.shape
    beta = np.random.rand(D, 1)
    # initialise history variables
    losses = [log_loss(X, y, beta)]
    betas = [beta]

    for i in range(max_iters):
        # Construct the batch by cycling through the data in order
        start = i * batch_size % N
        end = min(start + batch_size, N)
        idx = np.arange(start, end)
        batchX = X[idx]
        batchY = y[idx]
        grad = evaluate_gradient(batchX, batchY, beta)
        grad = grad[:, np.newaxis]
        # Perform the gradient descent step (as before)
        beta -= lr * grad
        if i % 10 == 0:
            loss = log_loss(X, y, beta)
            losses.append(loss)
        betas.append(beta.copy())

        if np.sqrt(grad.T.dot(grad)) < tol: break

    return betas, losses


# Another variant: draw each batch by random sampling

def minibatch_gradient_descent_sampling(X, y, batch_size=10, lr=1e-1, max_iters=1000, tol=1e-5):
    # randomly initialise beta
    N, D = X.shape
    beta = np.random.rand(D, 1)
    # initialise history variables
    losses = [log_loss(X, y, beta)]
    betas = [beta]

    for i in range(max_iters):
        # Randomly sample a batch of indices (with replacement)
        idx = np.random.randint(X.shape[0], size=batch_size)
        batchX = X[idx]
        batchY = y[idx]
        grad = evaluate_gradient(batchX, batchY, beta)
        grad = grad[:, np.newaxis]
        # Perform the gradient descent step (as before)
        beta -= lr * grad
        if i % 10 == 0:
            loss = log_loss(X, y, beta)
            losses.append(loss)
        betas.append(beta.copy())

        if np.sqrt(grad.T.dot(grad)) < tol: break

    return betas, losses

# Run minibatch gradient descent (cycling variant)
betas, losses = minibatch_gradient_descent(X_train_bt, y_train, batch_size=10, lr=1e-0)
# Check the accuracy of the predicted model using test data
from scipy.special import expit
beta = betas[-1]
# Calculate the predicted probabilities with the logistic (sigmoid) function
probs = expit(X_test_bt.dot(beta))

y_pred = list(map(lambda x : 1 if x >= 0.5 else -1, probs))

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)
0.76615
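
As an optional extra check (a small sketch that reuses only objects already defined above), the fitted parameters can also be scored with the test log-loss, and sklearn's confusion_matrix gives a more detailed view than the accuracy alone:

from sklearn.metrics import confusion_matrix

# Test log-loss under the fitted parameters (lower is better)
print(log_loss(X_test_bt, y_test, beta))
# Rows: true class, columns: predicted class
print(confusion_matrix(y_test.ravel(), y_pred))
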
### Now we investigate the impact of batch size on the convergence of the results
bs = [1, 5, 100]
_, losses1 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[0])
_, losses2 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[1])
_, losses3 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[2])

# create figure
fig = plt.figure(figsize=(5, 5))
# add subplot (rows, cols, number)
ax = fig.add_subplot(1, 1, 1, xlabel='iteration', ylabel='loss')
# plot data on new axis
ax.plot(losses1, color='red', marker='.', alpha=0.5, label='M = %s'%bs[0])
ax.plot(losses2, color='green', marker='.', alpha=0.5, label='M = %s'%bs[1])
ax.plot(losses3, color='blue', marker='.', alpha=0.5, label='M = %s'%bs[2])
ax.set_title('Stochastic gradient descent')
# display legend
plt.legend()
# display plot
plt.show()
[Figure: stochastic gradient descent loss curves for batch sizes M = 1, 5, 100]

PART III: Two SGD variants based on their sampling method, i.e., cycling vs. random sampling#

Below we compare the two variants for batch sizes of 1 and 10.

bs = [1, 10]
betas1, losses1 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[0])
betas2, losses2 = minibatch_gradient_descent_sampling(X_train, y_train, batch_size=bs[0])
betas3, losses3 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[1])
betas4, losses4 = minibatch_gradient_descent_sampling(X_train, y_train, batch_size=bs[1])

# create figure
fig = plt.figure(figsize=(10, 10))
# add subplot (rows, cols, number)
ax = fig.add_subplot(1, 1, 1, xlabel='iteration', ylabel='loss')
# plot data on new axis
ax.plot(losses1, color='red', marker='.', alpha=0.5, label='Cycling (M = %d)' % bs[0])
ax.plot(losses2, color='blue', marker='.', alpha=0.5, label='Sampling (M = %d)' % bs[0])
ax.plot(losses3, color='red', marker='*', alpha=0.5, label='Cycling (M = %d)' % bs[1])
ax.plot(losses4, color='blue', marker='*', alpha=0.5, label='Sampling (M = %d)' % bs[1])
# display plot
ax.set_title('Cycling vs. sampling minibatch gradient descent')
plt.legend()
plt.show()
[Figure: cycling vs. sampling minibatch gradient descent loss curves]
len(betas3),len(betas1)
(1001, 1001)
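
Both runs keep one parameter vector per iteration plus the initial one, hence the 1001 entries. As a final quick sketch (again using only the variables created above), the last recorded training losses of the two sampling schemes can be printed side by side:

# Final recorded training losses: cycling vs. random sampling, for each batch size
print("M=1:  cycling %.4f, sampling %.4f" % (losses1[-1], losses2[-1]))
print("M=10: cycling %.4f, sampling %.4f" % (losses3[-1], losses4[-1]))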