Lecture 4 example: Gradient method for parameter estimation#

  • NB: Even though gradient-based estimation is already built into most machine learning/regression libraries, it is worth understanding how it works under the hood

  • There are two main types of gradient methods: deterministic and stochastic

  • Think about their efficiency and accuracy, i.e., the pros and cons of each

PART I: For this implementation, we are going to use the advertising dataset.#

This dataset records the total sales of different products after they were advertised on television, radio and in newspapers. Using our algorithm, we can find out which medium contributes most to sales and assign weights to the media accordingly.

# load some necessary libraries
import pandas as pd
from numpy.random import randint

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
df=pd.read_csv('Advertising.csv')
df.head()
   Unnamed: 0     TV  radio  newspaper  sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9
# Standardise the target and each feature column (zero mean, unit standard deviation)
X=df[['TV','radio','newspaper']]
Y=df['sales']
Y=np.array((Y-Y.mean())/Y.std())
X=X.apply(lambda rec:(rec-rec.mean())/rec.std(),axis=0)
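
A quick, optional sanity check (a minimal sketch using only the objects defined above) confirms that the standardisation worked: every column of X and the target Y should now have mean roughly 0 and standard deviation roughly 1.

# Sanity check: means should be ~0 and standard deviations ~1 after standardisation
print(X.mean().round(6))
print(X.std().round(6))
print(round(Y.mean(), 6), round(Y.std(), 6))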

Once we have a normalised dataset, we can start defining the algorithm. Implementing gradient descent takes four steps:

  • Randomly initialize the bias b and the weight vector theta

  • Calculate the predicted value of Y given the current bias and weights

  • Calculate the cost function from the predicted and actual values of Y

  • Calculate the gradient and update the bias and weights (the corresponding formulas are written out just below)
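
Written out, the model and cost used in these steps are the standard linear-regression ones: for a record \(\mathbf{x}_i\), the prediction is \(\hat{y}_i = b + \mathbf{x}_i^T\boldsymbol\theta\), and the cost being minimised is the mean squared error \(J(b,\boldsymbol\theta) = \frac{1}{N}\sum_{i=1}^N (y_i - \hat{y}_i)^2\).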

import random
def initialize(dim):
    b=random.random()
    theta=np.random.rand(dim)
    return b,theta

b,theta=initialize(3)
print("Bias: ",b, ",   ", "Weights: ",theta)
Bias:  0.9526887046636117 ,    Weights:  [0.01683829 0.6870954  0.50098462]
def predict_Y(b,theta,X):
    return b + np.dot(X,theta)

Y_hat=predict_Y(b,theta,X)
Y_hat[0:10]
array([ 2.53068341,  2.00931635,  2.8660883 ,  2.43110685,  1.02304367,
        3.13437818,  1.21417681,  0.3418495 , -0.73375777, -0.20844808])
def get_cost(Y,Y_hat):
    # Mean squared error between the actual and predicted values
    Y_resd=Y-Y_hat
    return np.sum(np.dot(Y_resd.T,Y_resd))/len(Y)

Y_hat=predict_Y(b,theta,X)
get_cost(Y,Y_hat)
1.8254478835326722

NB: The parameters passed to the update function are

  • x, y: the input and output variables

  • y_hat: the predicted values of Y under the current bias and weights

  • b_0, theta_0: the current bias and weights

  • learning_rate: the learning rate that scales the update step (the update formulas are written out just below)
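
In terms of formulas (a worked statement of what the function below computes), the gradients of the mean squared error are \(\frac{\partial J}{\partial b} = \frac{2}{N}\sum_{i=1}^N(\hat{y}_i - y_i)\) and \(\frac{\partial J}{\partial \boldsymbol\theta} = \frac{2}{N}\mathbf{X}^T(\hat{\mathbf{y}} - \mathbf{y})\), and the update step is \(b \leftarrow b - \alpha\,\frac{\partial J}{\partial b}\), \(\boldsymbol\theta \leftarrow \boldsymbol\theta - \alpha\,\frac{\partial J}{\partial \boldsymbol\theta}\), where \(\alpha\) is the learning rate.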

def update_theta(x,y,y_hat,b_0,theta_0,learning_rate):
    # Gradients of the mean squared error with respect to the bias and the weights
    db=(np.sum(y_hat-y)*2)/len(y)
    dw=(np.dot((y_hat-y),x)*2)/len(y)
    # Take one gradient descent step
    b_1=b_0-learning_rate*db
    theta_1=theta_0-learning_rate*dw
    return b_1,theta_1

print("After initialization -Bias: ",b,"theta: ",theta)
Y_hat=predict_Y(b,theta,X)
b,theta=update_theta(X,Y,Y_hat,b,theta,0.01)
print("After first update -Bias: ",b,"theta: ",theta)
get_cost(Y,Y_hat)
After initialization -Bias:  0.9526887046636117 theta:  [0.01683829 0.6870954  0.50098462]
After first update -Bias:  0.9336349305703394 theta:  [0.03075531 0.68134039 0.49069747]
1.8254478835326722
def run_gradient_descent(X,Y,alpha,num_iterations):
    b,theta=initialize(X.shape[1])
    iter_num=0
    gd_iterations_df=pd.DataFrame(columns=['iteration','cost'])
    result_idx=0
    for each_iter in range(num_iterations):
        Y_hat=predict_Y(b,theta,X)
        this_cost=get_cost(Y,Y_hat)
        prev_b=b
        prev_theta=theta
        b,theta=update_theta(X,Y,Y_hat,prev_b,prev_theta,alpha)
        # Record the cost every 10 iterations
        if((iter_num%10==0)):
            gd_iterations_df.loc[result_idx]=[iter_num,this_cost]
        result_idx=result_idx+1
        iter_num +=1
    print("Final Estimate of b and theta is: ",b,theta)
    return gd_iterations_df,b,theta

gd_iterations_df,b,theta=run_gradient_descent(X,Y,alpha=0.001,num_iterations=200)
Final Estimate of b and theta is:  0.3364027827437597 [0.46412853 0.2043997  0.13832669]
gd_iterations_df[0:10]
    iteration      cost
0         0.0  0.762186
10       10.0  0.736024
20       20.0  0.710951
30       30.0  0.686920
40       40.0  0.663886
50       50.0  0.641806
60       60.0  0.620638
70       70.0  0.600343
80       80.0  0.580883
90       90.0  0.562223
%matplotlib inline
plt.plot(gd_iterations_df['iteration'],gd_iterations_df['cost'])
plt.xlabel("Number of iterations")
plt.ylabel("Cost or MSE")
Text(0, 0.5, 'Cost or MSE')
[Figure: cost (MSE) vs. number of iterations]
alpha_df_1,b,theta=run_gradient_descent(X,Y,alpha=0.01,num_iterations=2000)
alpha_df_2,b,theta=run_gradient_descent(X,Y,alpha=0.001,num_iterations=2000)
plt.plot(alpha_df_1['iteration'],alpha_df_1['cost'],label="alpha=0.01")
plt.plot(alpha_df_2['iteration'],alpha_df_2['cost'],label="alpha=0.001")
plt.legend()
plt.ylabel('cost')
plt.xlabel('Number of iterations')
plt.title('Cost Vs. Iterations for different alpha values')
Final Estimate of b and theta is:  2.7774630888127755e-16 [ 0.75306591  0.53648155 -0.00433069]
Final Estimate of b and theta is:  0.015495503503709054 [0.74481196 0.50267295 0.03065355]
Text(0.5, 1.0, 'Cost Vs. Iterations for different alpha values')
[Figure: cost vs. iterations for alpha = 0.01 and alpha = 0.001]

PART II: Demonstration of stochastic gradient descent#

A good reference to work on: https://jcboyd.github.io/assets/lsml2018/stochastic_gradient_descent.html

from scipy import io
from numpy import log, exp

data = io.loadmat('data_orsay_2017.mat')

X_train, y_train = data['Xtrain'], data['ytrain']
X_test, y_test = data['Xtest'], data['ytest']

print('X_train shape: %s' % str(X_train.shape))
print('y_train shape: %s' % str(y_train.shape))
print('X_test shape: %s' % str(X_test.shape))
print('y_test shape: %s' % str(y_test.shape))
X_train shape: (10000, 100)
y_train shape: (10000, 1)
X_test shape: (100000, 100)
y_test shape: (100000, 1)
# Prepend a column of ones so that each row becomes [1, x] (the first coefficient then acts as the bias)
X_train_bt = np.concatenate([np.ones((X_train.shape[0], 1)), X_train], axis=1)
X_test_bt = np.concatenate([np.ones((X_test.shape[0], 1)), X_test], axis=1)
X_train_bt.shape, X_test_bt.shape 
((10000, 101), (100000, 101))
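
Prepending a column of ones folds the intercept into the parameter vector: with the augmented input, \(\mathbf{x}^T\boldsymbol\beta = \beta_0 + \sum_{j=1}^{100}\beta_j x_j\), so \(\beta_0\) plays the same role as the bias term \(b\) in Part I and a single dot product covers both bias and weights.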

Define the logistic loss function: \(\mathrm{Loss} = \frac{1}{N}\sum_{i=1}^N\log(1 + \exp(-y_i\mathbf{x}_i^T\boldsymbol\beta))\)

# Define the logistic loss function: $\frac{1}{N}\sum_{i=1}^N\log(1 + \exp(-y_i\mathbf{x}_i^T\boldsymbol\beta))$

def log_loss(X, y, beta):
    # Average logistic loss over all samples
    return np.sum(np.log(1 + np.exp(-y * X.dot(beta)))) / X.shape[0]


# Implement a function to return the logistic loss gradient:
def evaluate_gradient(X, y, beta):
    # Gradient of the logistic loss with respect to beta, averaged over the samples
    return np.sum(-X * y / (1 + np.exp(y * X.dot(beta))), axis=0) / X.shape[0]
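
For reference, the gradient implemented above is \(\nabla_{\boldsymbol\beta}\,\mathrm{Loss} = -\frac{1}{N}\sum_{i=1}^N \frac{y_i\,\mathbf{x}_i}{1 + \exp(y_i\mathbf{x}_i^T\boldsymbol\beta)}\), obtained by differentiating the logistic loss with respect to \(\boldsymbol\beta\).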

NB: Stochastic gradient descent#

Since the number of training samples is 10,000, computing the full gradient at every step becomes costly, so we use stochastic (minibatch) gradient descent instead.
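
At each iteration, stochastic (minibatch) gradient descent evaluates the gradient on a small batch \(B\) of \(M\) samples rather than on all \(N\) of them, and takes the step \(\boldsymbol\beta \leftarrow \boldsymbol\beta - \eta\,\nabla_{\boldsymbol\beta}\mathrm{Loss}_B(\boldsymbol\beta)\), where \(\mathrm{Loss}_B\) is the logistic loss averaged over the batch only. Each update therefore costs \(O(MD)\) instead of \(O(ND)\) operations, at the price of a noisier gradient estimate.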

# Pay attention to how the batch indices are generated: here the batches cycle through the data in order
def minibatch_gradient_descent(X, y, batch_size=10, lr=1e-1, max_iters=1000, tol=1e-5):
    # randomly initialise beta
    N, D = X.shape
    beta = np.random.rand(D, 1)
    # initialise history variables
    losses = [log_loss(X, y, beta)]
    betas = [beta]

    for i in range(max_iters):
        # Construct the batch by cycling through the data in order
        start = i * batch_size % N
        end = min(start + batch_size, N)
        idx = np.arange(start, end)
        batchX = X[idx]
        batchY = y[idx]
        grad = evaluate_gradient(batchX, batchY, beta)
        grad = grad[:, np.newaxis]
        # Perform the gradient descent step (as before)
        beta -= lr * grad
        if i % 10 == 0:
            loss = log_loss(X, y, beta)
            losses.append(loss)
        betas.append(beta.copy())

        if np.sqrt(grad.T.dot(grad)) < tol: break

    return betas, losses


# Another variant: draw each batch by random sampling

def minibatch_gradient_descent_sampling(X, y, batch_size=10, lr=1e-1, max_iters=1000, tol=1e-5):
    # randomly initialise beta
    N, D = X.shape
    beta = np.random.rand(D, 1)
    # initialise history variables
    losses = [log_loss(X, y, beta)]
    betas = [beta]

    for i in range(max_iters):
        # Randomly sample a batch of indices (with replacement)
        idx = np.random.randint(X.shape[0], size=batch_size)
        batchX = X[idx]
        batchY = y[idx]
        grad = evaluate_gradient(batchX, batchY, beta)
        grad = grad[:, np.newaxis]
        # Perform the gradient descent step (as before)
        beta -= lr * grad
        if i % 10 == 0:
            loss = log_loss(X, y, beta)
            losses.append(loss)
        betas.append(beta.copy())

        if np.sqrt(grad.T.dot(grad)) < tol: break

    return betas, losses

# Run minibatch gradient descent (cycling variant)
betas, losses = minibatch_gradient_descent(X_train_bt, y_train, batch_size=10, lr=1e-0)
# Check the accuracy of the predicted model using test data
from scipy.special import expit
beta = betas[-1]
# Calculate the predicted probabilities with the logistic (sigmoid) function
probs = expit(X_test_bt.dot(beta))

y_pred = list(map(lambda x : 1 if x >= 0.5 else -1, probs))

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)
0.76615
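
As an optional extra check (a small sketch that reuses only objects already defined above), the fitted parameters can also be scored with the test log-loss, and sklearn's confusion_matrix gives a more detailed view than the accuracy alone:

from sklearn.metrics import confusion_matrix

# Test log-loss under the fitted parameters (lower is better)
print(log_loss(X_test_bt, y_test, beta))
# Rows: true class, columns: predicted class
print(confusion_matrix(y_test.ravel(), y_pred))
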
### Now we investigate the impact of batch size on the convergence of the results
bs = [1, 5, 100]
_, losses1 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[0])
_, losses2 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[1])
_, losses3 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[2])

# create figure
fig = plt.figure(figsize=(5, 5))
# add subplot (rows, cols, number)
ax = fig.add_subplot(1, 1, 1, xlabel='iteration', ylabel='loss')
# plot data on new axis
ax.plot(losses1, color='red', marker='.', alpha=0.5, label='M = %s'%bs[0])
ax.plot(losses2, color='green', marker='.', alpha=0.5, label='M = %s'%bs[1])
ax.plot(losses3, color='blue', marker='.', alpha=0.5, label='M = %s'%bs[2])
ax.set_title('Stochastic gradient descent')
# display legend
plt.legend()
# display plot
plt.show()
[Figure: stochastic gradient descent loss curves for batch sizes M = 1, 5, 100]

PART III: Two SGD variants based on their sampling method, i.e., cycling vs. random sampling#

Below we compare the two variants for batch sizes of 1 and 10.

bs = [1, 10]
betas1, losses1 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[0])
betas2, losses2 = minibatch_gradient_descent_sampling(X_train, y_train, batch_size=bs[0])
betas3, losses3 = minibatch_gradient_descent(X_train, y_train, batch_size=bs[1])
betas4, losses4 = minibatch_gradient_descent_sampling(X_train, y_train, batch_size=bs[1])

# create figure
fig = plt.figure(figsize=(10, 10))
# add subplot (rows, cols, number)
ax = fig.add_subplot(1, 1, 1, xlabel='iteration', ylabel='loss')
# plot data on new axis
ax.plot(losses1, color='red', marker='.', alpha=0.5, label='Cycling (M = %d)' % bs[0])
ax.plot(losses2, color='blue', marker='.', alpha=0.5, label='Sampling (M = %d)' % bs[0])
ax.plot(losses3, color='red', marker='*', alpha=0.5, label='Cycling (M = %d)' % bs[1])
ax.plot(losses4, color='blue', marker='*', alpha=0.5, label='Sampling (M = %d)' % bs[1])
# display plot
ax.set_title('Cycling vs. sampling minibatch gradient descent')
plt.legend()
plt.show()
[Figure: cycling vs. sampling minibatch gradient descent loss curves]
len(betas3),len(betas1)
(1001, 1001)
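
Both runs keep one parameter vector per iteration plus the initial one, hence the 1001 entries. As a final quick sketch (again using only the variables created above), the last recorded training losses of the two sampling schemes can be printed side by side:

# Final recorded training losses: cycling vs. random sampling, for each batch size
print("M=1:  cycling %.4f, sampling %.4f" % (losses1[-1], losses2[-1]))
print("M=10: cycling %.4f, sampling %.4f" % (losses3[-1], losses4[-1]))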