Lecture 2 Example – Linear regression analysis#

  • The data will be provided on the course website/cloud drive

  • You can rework this example, together with a similar exercise, after class

Part I: Simple linear regression#

1, Load some basic Python/ML libraries#

from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import numpy as np
import matplotlib.pyplot as plt

2, Load the data and plot them#

# load the data
X = []
Y = []
for line in open('data_1d.csv'):
    x, y = line.split(',')
    X.append(float(x))
    Y.append(float(y))

# let's turn X and Y into numpy arrays since that will be useful later
X = np.array(X)
Y = np.array(Y)


# let's plot the data to see what it looks like
plt.scatter(X, Y)
plt.show()
(Output: scatter plot of the X vs. Y data)
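Equivalently, the same file could be read in a single call (a minimal sketch, assuming data_1d.csv has exactly two comma-separated numeric columns and no header row):

data = np.genfromtxt('data_1d.csv', delimiter=',')  # array of shape (n_samples, 2)
X, Y = data[:, 0], data[:, 1]                       # first column is x, second is y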

3, Do the linear regression analysis#

  • 3.1, apply the equations we learned in class to calculate the slope a and the intercept b (written out just below this list)

  • 3.2, alternatively, load the ML library sklearn and use its ready-made regression functions
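For reference, the least-squares estimates implemented in step 3.1 below are, for N data points with sample means x̄ and ȳ (these match the expressions used in the code):

$$a = \frac{\sum_i x_i y_i - N\,\bar{x}\,\bar{y}}{\sum_i x_i^2 - N\,\bar{x}^2},
\qquad
b = \frac{\bar{y}\sum_i x_i^2 - \bar{x}\sum_i x_i y_i}{\sum_i x_i^2 - N\,\bar{x}^2}$$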

# 3.1, use the provided regression formulas to estimate the coefficients for simple linear regression
# denominator is common
# note: this could be more efficient if
#       we only computed the sums and means once
denominator = X.dot(X) - X.mean() * X.sum()
a = ( X.dot(Y) - Y.mean()*X.sum() ) / denominator
b = ( Y.mean() * X.dot(X) - X.mean() * X.dot(Y) ) / denominator

# let's calculate the predicted Y
Yhat = a*X + b

# let's plot everything together to make sure it worked
plt.scatter(X, Y)
plt.plot(X, Yhat)
plt.show()

# determine how good the model is by computing the r-squared
d1 = Y - Yhat
d2 = Y - Y.mean()
r2 = 1 - d1.dot(d1) / d2.dot(d2)
print("the r-squared is:", r2)
(Output: scatter plot of the data with the fitted regression line)
the r-squared is: 0.9911838202977805
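As an optional sanity check (a sketch, not part of the original notebook): for simple linear regression with an intercept, the r-squared equals the squared Pearson correlation between X and Y, so the value above can be cross-checked with np.corrcoef:

# cross-check: the squared correlation should match the r-squared printed above
r = np.corrcoef(X, Y)[0, 1]
print("squared correlation:", r**2)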
# 3.2, use the sklearn library for linear regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
x = np.expand_dims(X, 1)  # sklearn expects a 2-D feature array of shape (n_samples, n_features)

reg.fit(x, Y)
print("sklearn r-squared:", reg.score(x, Y))

# extract the fitted slope and intercept
a1 = reg.coef_
b1 = reg.intercept_

Yhat = a1 * X + b1

plt.scatter(X, Y)
plt.plot(X, Yhat)
plt.show()
(Output: scatter plot of the data with the sklearn regression line)
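The two approaches should agree. A minimal check (assuming the variables a and b from step 3.1 and a1, b1 from above are still defined):

# the closed-form estimates and sklearn's estimates should coincide up to floating-point error
print("manual:  a =", a, " b =", b)
print("sklearn: a =", a1[0], " b =", b1)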

Part II: Demonstrating whether a regression fit is under- or over-fitted#

# notes for this course can be found at:
# https://deeplearningcourses.com/c/data-science-linear-regression-in-python
# https://www.udemy.com/data-science-linear-regression-in-python

from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import numpy as np
import matplotlib.pyplot as plt


def make_poly(X, deg):
    # build the polynomial design matrix [1, X, X**2, ..., X**deg], shape (len(X), deg+1)
    n = len(X)
    data = [np.ones(n)]
    for d in range(deg):
        data.append(X**(d+1))
    return np.vstack(data).T


def fit(X, Y):
    # solve the normal equations (X^T X) w = X^T Y for the weight vector w
    return np.linalg.solve(X.T.dot(X), X.T.dot(Y))
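

# Aside (not in the original script): for high polynomial degrees the normal
# equations can become ill-conditioned; np.linalg.lstsq solves the same
# least-squares problem more stably. The helper name below is ours.
def fit_lstsq(X, Y):
    return np.linalg.lstsq(X, Y, rcond=None)[0]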


def fit_and_display(X, Y, sample, deg):
    # randomly pick `sample` training points, fit a degree-`deg` polynomial,
    # and plot the fit against the full data
    N = len(X)
    train_idx = np.random.choice(N, sample)
    Xtrain = X[train_idx]
    Ytrain = Y[train_idx]

    # show the training points on their own first
    plt.scatter(Xtrain, Ytrain)
    plt.show()

    # fit polynomial
    Xtrain_poly = make_poly(Xtrain, deg)
    w = fit(Xtrain_poly, Ytrain)

    # display the polynomial
    X_poly = make_poly(X, deg)
    Y_hat = X_poly.dot(w)
    plt.plot(X, Y)
    plt.plot(X, Y_hat)
    plt.scatter(Xtrain, Ytrain)
    plt.title("deg = %d" % deg)
    plt.show()


def get_mse(Y, Yhat):
    # mean squared error between targets and predictions
    d = Y - Yhat
    return d.dot(d) / len(d)


def plot_train_vs_test_curves(X, Y, sample=20, max_deg=20):
    # draw a random training subset, keep the remaining points as a test set,
    # and compare train/test MSE across polynomial degrees
    N = len(X)
    train_idx = np.random.choice(N, sample)
    Xtrain = X[train_idx]
    Ytrain = Y[train_idx]

    # every index not chosen for training goes into the test set
    test_idx = [idx for idx in range(N) if idx not in train_idx]
    # test_idx = np.random.choice(N, sample)
    Xtest = X[test_idx]
    Ytest = Y[test_idx]

    mse_trains = []
    mse_tests = []
    for deg in range(max_deg+1):
        Xtrain_poly = make_poly(Xtrain, deg)
        w = fit(Xtrain_poly, Ytrain)
        Yhat_train = Xtrain_poly.dot(w)
        mse_train = get_mse(Ytrain, Yhat_train)

        Xtest_poly = make_poly(Xtest, deg)
        Yhat_test = Xtest_poly.dot(w)
        mse_test = get_mse(Ytest, Yhat_test)

        mse_trains.append(mse_train)
        mse_tests.append(mse_test)

    plt.plot(mse_trains, label="train mse")
    plt.plot(mse_tests, label="test mse")
    plt.legend()
    plt.show()

    plt.plot(mse_trains, label="train mse")
    plt.legend()
    plt.show()

if __name__ == "__main__":
    # make up some data and plot it
    N = 100
    X = np.linspace(0, 6*np.pi, N)
    Y = np.sin(X)

    plt.plot(X, Y)
    plt.show()

    for deg in (5, 6, 7, 8, 9):
        fit_and_display(X, Y, 10, deg)
    plot_train_vs_test_curves(X, Y)
(Output: the underlying sine curve; then, for each degree from 5 to 9, a scatter of the training sample followed by the fitted polynomial plotted over the true curve; finally, the train vs. test MSE curves and the train MSE curve alone.)
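As a follow-up experiment (a sketch that reuses the functions defined above and assumes X and Y are still in scope; not part of the original script), you can vary the training-sample size to see how more data reduces overfitting at high degrees:

# with more training points, the gap between train and test MSE at high degrees shrinks
for sample_size in (10, 20, 50):
    plot_train_vs_test_curves(X, Y, sample=sample_size, max_deg=20)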