Lecture 8 – Computer Example

1. Load the libraries

from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime

2. Load the data and show its shape

# Load the house-price dataset bundle.
# NOTE(review): `load_boston` is not available in recent scikit-learn
# releases -- confirm the scikit-learn version this lecture is pinned to.
data = load_boston()

# Unpack the feature matrix and the target vector from the bundle.
X, y = data.data, data.target
X.shape
(506, 13)
y.shape
(506,)

3. Randomly split the data into training/test sets and set up cross-validation

# 5-fold cross-validation splitter; shuffling is seeded so folds are repeatable.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Hold out 30% of the samples as the test set (seeded split).
Xtrain, Xtest, Ytrain, Ytest = TTS(
    X,
    y,
    test_size=0.3,
    random_state=420,
)

4. Fit the model with machine learning

# Sweep the number of boosting rounds and record, for each setting, the mean
# cross-validation score (estimator's default scorer) over the `cv` folds.
axisx = range(10, 1010, 50)
rs = []
# Kept as an explicit loop (not a comprehension) so that `reg` stays bound to
# the last estimator after the sweep; later cells reuse it.
for i in axisx:
    # `verbosity=0` is the supported way to silence XGBoost; the older
    # `silent=True` flag is deprecated and triggers "unused parameter" warnings
    # on current releases.
    reg = XGBR(n_estimators=i, random_state=420, verbosity=0)
    rs.append(CVS(reg, Xtrain, Ytrain, cv=cv).mean())

# Report the n_estimators value with the highest mean score, and that score.
print(axisx[rs.index(max(rs))],max(rs))

# Learning curve: mean CV score as a function of n_estimators.
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="red", label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915
../../../../_images/35fc3af7c6bef0e2d317a108b0a1c2043b6f212593d88b89549cdfb7b4fc3e8d.png
def regassess(reg, Xtrain, Ytrain, cv, scoring=["r2"], show=True):
    """Cross-validate ``reg`` once per metric and return the mean scores.

    Parameters
    ----------
    reg
        Estimator handed unchanged to ``CVS``.
    Xtrain, Ytrain
        Training features and targets handed to ``CVS``.
    cv
        Cross-validation strategy handed to ``CVS`` as ``cv``.
    scoring
        Scoring names, evaluated in the order given.  The default list is
        only read, never mutated.
    show
        When true, print ``<name>:<mean score>`` (two decimals) per metric.

    Returns
    -------
    list
        Mean cross-validation score for each entry of ``scoring``, in order.
    """
    score = []
    for metric in scoring:
        # Run the cross-validation a single time per metric and reuse the
        # result for both the printout and the returned list; this halves the
        # work when ``show`` is true and guarantees both show the same value.
        mean_score = CVS(reg, Xtrain, Ytrain, cv=cv, scoring=metric).mean()
        if show:
            # First field is the name of the model evaluation metric.
            print("{}:{:.2f}".format(metric, mean_score))
        score.append(mean_score)
    return score
# Score the estimator left bound by the n_estimators sweep above (its final
# iteration, n_estimators=960) on both metrics, without printing.
regassess(
    reg,
    Xtrain,
    Ytrain,
    cv,
    scoring=["r2", "neg_mean_squared_error"],
    show=False,
)
[0.8045157132622822, -13.326379090387746]

5. Evaluate the ML model

from time import time
import datetime

# Compare several learning rates at a fixed number of boosting rounds,
# printing both metrics and the wall-clock time of each evaluation.
for i in [0, 0.2, 0.5, 1]:
    time0 = time()
    # `verbosity=0` is the supported replacement for the deprecated
    # `silent=True` flag.
    reg = XGBR(n_estimators=180, random_state=420, learning_rate=i, verbosity=0)
    print("learning_rate = {}".format(i))
    regassess(reg, Xtrain, Ytrain, cv, scoring=["r2", "neg_mean_squared_error"])
    # Format the elapsed seconds as MM:SS:microseconds.  The duration is
    # converted in UTC rather than local time, so the minutes field is not
    # shifted in time zones with a 30/45-minute offset and no platform
    # `localtime` call is made on a near-epoch value.
    elapsed = time() - time0
    print(
        datetime.datetime.fromtimestamp(elapsed, tz=datetime.timezone.utc)
        .strftime("%M:%S:%f")
    )
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:008537
	
learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:084865
	
learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:093646
	
learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:109553