天气最高温度
我们要完成三个任务
随机森林建模 --》 选择特征 - 》 增加数据量和特征个数 --》 找到最优的参数
掌握机器学习里面2种经典的参数调节方法
读数据
import pandas as pd
data = pd.read_csv("temps.csv")
data.head()
|
year |
month |
day |
week |
temp_2 |
temp_1 |
average |
actual |
friend |
| 0 |
2019 |
1 |
1 |
Fri |
45 |
45 |
45.6 |
45 |
29 |
| 1 |
2019 |
1 |
2 |
Sat |
44 |
45 |
45.7 |
44 |
61 |
| 2 |
2019 |
1 |
3 |
Sun |
45 |
44 |
45.8 |
41 |
56 |
| 3 |
2019 |
1 |
4 |
Mon |
44 |
41 |
45.9 |
40 |
53 |
| 4 |
2019 |
1 |
5 |
Tues |
41 |
40 |
46.0 |
44 |
41 |
## 看看数据多少维度
print(data.shape)
(348, 9)
data.describe()
|
year |
month |
day |
temp_2 |
temp_1 |
average |
actual |
friend |
| count |
348.0 |
348.000000 |
348.000000 |
348.000000 |
348.000000 |
348.000000 |
348.000000 |
348.000000 |
| mean |
2019.0 |
6.477011 |
15.514368 |
62.652299 |
62.701149 |
59.760632 |
62.543103 |
60.034483 |
| std |
0.0 |
3.498380 |
8.772982 |
12.165398 |
12.120542 |
10.527306 |
11.794146 |
15.626179 |
| min |
2019.0 |
1.000000 |
1.000000 |
35.000000 |
35.000000 |
45.100000 |
35.000000 |
28.000000 |
| 25% |
2019.0 |
3.000000 |
8.000000 |
54.000000 |
54.000000 |
49.975000 |
54.000000 |
47.750000 |
| 50% |
2019.0 |
6.000000 |
15.000000 |
62.500000 |
62.500000 |
58.200000 |
62.500000 |
60.000000 |
| 75% |
2019.0 |
10.000000 |
23.000000 |
71.000000 |
71.000000 |
69.025000 |
71.000000 |
71.000000 |
| max |
2019.0 |
12.000000 |
31.000000 |
117.000000 |
117.000000 |
77.400000 |
92.000000 |
95.000000 |
时间处理函数
import datetime
years = data['year']
months = data['month']
days = data['day']
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year , month, day in zip (years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]
dates[:5]
[datetime.datetime(2019, 1, 1, 0, 0),
datetime.datetime(2019, 1, 2, 0, 0),
datetime.datetime(2019, 1, 3, 0, 0),
datetime.datetime(2019, 1, 4, 0, 0),
datetime.datetime(2019, 1, 5, 0, 0)]
数据展示
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight') # 绘图风格
展示四个指标:分别为最高气温的标签值、前天、昨天、朋友预测的气温最高值,四个图。
fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2,figsize=(10,10))
fig.autofmt_xdate(rotation=45)
# 最高气温的标签值
ax1.plot(dates,data['actual'])
ax1.set_xlabel('');ax1.set_ylabel('Temperature');ax1.set_title('Max Temp')
# 昨天的最高温度值
ax2.plot(dates,data['temp_1'])
ax2.set_xlabel('');ax2.set_ylabel('Temperature');ax2.set_title('Yesterday Max Temp')
# 前天的最高温度值
ax3.plot(dates,data['temp_2'])
ax3.set_xlabel('');ax3.set_ylabel('Temperature');ax3.set_title('Two Days Prior Max Temp')
# 朋友预测的最高温度值
ax4.plot(dates,data['friend'])
ax4.set_xlabel('');ax4.set_ylabel('Temperature');ax4.set_title('Friend Forcast')
plt.tight_layout(pad=2)

数据预处理,比如 周几,这个要转成 计算机可识别的数字
# 独热编码
data = pd.get_dummies(data) #自动转换,自动添加后缀
data.head(5)
|
year |
month |
day |
temp_2 |
temp_1 |
average |
actual |
friend |
week_Fri |
week_Mon |
week_Sat |
week_Sun |
week_Thurs |
week_Tues |
week_Wed |
| 0 |
2019 |
1 |
1 |
45 |
45 |
45.6 |
45 |
29 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
| 1 |
2019 |
1 |
2 |
44 |
45 |
45.7 |
44 |
61 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 2 |
2019 |
1 |
3 |
45 |
44 |
45.8 |
41 |
56 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
| 3 |
2019 |
1 |
4 |
44 |
41 |
45.9 |
40 |
53 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
| 4 |
2019 |
1 |
5 |
41 |
40 |
46.0 |
44 |
41 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
# 数据与标签
import numpy as np
# 标签
labels = np.array(data['actual'])
# 特征中去除标签
data = data.drop('actual',axis=1) # 按照列去掉
# 名字单独保留
feature_list = list(data.columns)
# 转换为合适的格式
features = np.array(data)
# 数据集切分
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features,labels,test_size=0.25,random_state=42)
print('训练集特征:',train_features.shape)
print('训练集标签:',train_labels.shape)
print('测试集标签:',test_features.shape)
print('测试机标签:',test_labels.shape)
训练集特征: (261, 14)
训练集标签: (261,)
测试集标签: (87, 14)
测试机标签: (87,)
### 构建随机森林模型,采用 mape 平均绝对误差百分比
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1000,random_state=42)
rf.fit(train_features,train_labels)
predictions = rf.predict(test_features)
errors = abs(predictions - test_labels)
mape = 100 * (errors / test_labels)
print('MAPE:',np.mean(mape))
MAPE: 6.016378550202468
from sklearn.tree import export_graphviz
import pydot
import os
tree = rf.estimators_[5]
export_graphviz(tree,out_file="tree.dot",feature_names=feature_list,rounded=True,precision=1)
(graph,) = pydot.graph_from_dot_file('./tree.dot')
graph.write_png('tree.png')
# 限制树模型
rf_small = RandomForestRegressor(n_estimators=10,max_depth=3,random_state=42)
rf_small.fit(train_features,train_labels)
tree_small = rf_small.estimators_[5]
export_graphviz(tree_small,out_file='small_tree.dot',feature_names=feature_list,rounded=True,precision=1)
(graph,) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
# 决策树特征重要性
importances = list(rf.feature_importances_)
# 格式转换
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list,importances)]
feature_importances = sorted(feature_importances,key=lambda x:x[1],reverse=True)
# 打印
[print('Variable:{:20} importance: {}'.format(*pair)) for pair in feature_importances]
Variable:temp_1 importance: 0.69
Variable:average importance: 0.2
Variable:day importance: 0.03
Variable:friend importance: 0.03
Variable:temp_2 importance: 0.02
Variable:month importance: 0.01
Variable:year importance: 0.0
Variable:week_Fri importance: 0.0
Variable:week_Mon importance: 0.0
Variable:week_Sat importance: 0.0
Variable:week_Sun importance: 0.0
Variable:week_Thurs importance: 0.0
Variable:week_Tues importance: 0.0
Variable:week_Wed importance: 0.0
[None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None]
# 绘制为直方图
x_values = list(range(len(importances)))
plt.bar(x_values,importances,orientation='vertical')
plt.xticks(x_values,feature_list,rotation='vertical')
plt.ylabel('Importance');plt.xlabel('Variable');plt.title('Variable Importances')
Text(0.5, 1.0, 'Variable Importances')

# 尝试使用最重要的两个特征
rf_most_important = RandomForestRegressor(n_estimators=1000,random_state=42)
# 最重要特征
important_indices = [feature_list.index('temp_1'),feature_list.index('average')]
train_important = train_features[:,important_indices]
test_important = test_features[:,important_indices]
# 重新训练模型
rf_most_important.fit(train_important,train_labels)
# 预测结果
predictions = rf_most_important.predict(test_important)
errors = abs(predictions-test_labels)
# 评估结果,保留两位小数
print('Mean Absolute Error:',round(np.mean(errors),2),'%')
mape = np.mean(100*(errors/test_labels))
print('mape:',mape)
Mean Absolute Error: 3.92 %
mape: 6.243108595734665
发现,mape的值从6.0上升到6.2,并没有下降,说明不能只选择最重要的特征
# 日期
months = features[:,feature_list.index('month')]
days = features[:,feature_list.index('day')]
years = features[:,feature_list.index('year')]
# 转换日期
dates = [str(int(year))+'-'+str(int(month))+'-'+str(int(day)) for year, month, day in zip(years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]
# 创建表格保存日期和其对应的标签数据
true_data = pd.DataFrame(data={'date':dates,'actual':labels})
# 另一个表格表示日期和对应预测值
months = test_features[:,feature_list.index('month')]
days = test_features[:,feature_list.index('day')]
years = test_features[:,feature_list.index('year')]
test_dates = [str(int(year))+'-'+str(int(month))+'-'+str(int(day)) for year,month,day in zip(years,months,days)]
test_dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in test_dates]
predictions_data = pd.DataFrame(data = {'date':test_dates,'prediction':predictions})
# 真实值
plt.plot(true_data['date'],true_data['actual'],'b-',label='actual')
# 预测值
plt.plot(predictions_data['date'],predictions_data['prediction'],'ro',label='prediction')
plt.xticks(rotation='60')
plt.legend()
#图名
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Acture and Predicted Values');

### 深入数据分析
## 如果可以利用的数据量增大,会对结果产生什么影响呢
## 加入的新特征会改进模型效果吗,此时的时间效率又怎么样
数据增多,采用新的数据集
import pandas as pd
features = pd.read_csv('temps_extended.csv')
features.head(5)
|
year |
month |
day |
weekday |
ws_1 |
prcp_1 |
snwd_1 |
temp_2 |
temp_1 |
average |
actual |
friend |
| 0 |
2011 |
1 |
1 |
Sat |
4.92 |
0.00 |
0 |
36 |
37 |
45.6 |
40 |
40 |
| 1 |
2011 |
1 |
2 |
Sun |
5.37 |
0.00 |
0 |
37 |
40 |
45.7 |
39 |
50 |
| 2 |
2011 |
1 |
3 |
Mon |
6.26 |
0.00 |
0 |
40 |
39 |
45.8 |
42 |
42 |
| 3 |
2011 |
1 |
4 |
Tues |
5.59 |
0.00 |
0 |
39 |
42 |
45.9 |
38 |
59 |
| 4 |
2011 |
1 |
5 |
Wed |
3.80 |
0.03 |
0 |
42 |
38 |
46.0 |
45 |
39 |
print('数据规模',features.shape)
数据规模 (2191, 12)
## 时间转化,用标准时间格式方便后续工作
import datetime
years = features['year']
months = features['month']
days = features['day']
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]
dates[:5]
[datetime.datetime(2011, 1, 1, 0, 0),
datetime.datetime(2011, 1, 2, 0, 0),
datetime.datetime(2011, 1, 3, 0, 0),
datetime.datetime(2011, 1, 4, 0, 0),
datetime.datetime(2011, 1, 5, 0, 0)]
# 对新特征进行可视化展示
fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2,figsize=(15,10))
fig.autofmt_xdate(rotation=45)
# 平均最高气温
ax1.plot(dates,features['average'])
ax1.set_xlabel('');ax1.set_ylabel('Tempertature (F)');ax1.set_title('Historical Avg Max Temp')
# 风速
ax2.plot(dates,features['ws_1'],'r-')
ax2.set_xlabel('');ax2.set_ylabel('Wind Speed (mph))');ax2.set_title('Prior Wind Speed')
# 降水
ax3.plot(dates,features['prcp_1'],'r-')
ax3.set_xlabel('Date');ax3.set_ylabel('Precipitation (in)');ax3.set_title('Prior Precipitation')
# 积雪
ax4.plot(dates,features['snwd_1'],'ro')
ax4.set_xlabel('Date');ax4.set_ylabel('Snow Depth (in)');ax4.set_title('Prior Snow Depth')
plt.tight_layout(pad=2)

天气变换与季节因素有关,然而数据集中并没有体现季节的特征,可以自己创建
# 季节变量
seasons = []
for month in features['month']:
if month in [1,2,12]:
seasons.append('winter')
elif month in [3,4,5]:
seasons.append('spring')
elif month in [6,7,8]:
seasons.append('summer')
elif month in [9,10,11]:
seasons.append('fall')
reduced_features = features[['temp_1','prcp_1','average','actual']]
reduced_features['season'] = seasons
C:\Users\Owner\AppData\Local\Temp\ipykernel_15292\2969630295.py:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
reduced_features['season'] = seasons
import seaborn as sns
sns.set(style='ticks',color_codes=True)
# 主题
palette = sns.xkcd_palette(['dark blue','dark green','gold','orange'])
# pairplot绘图
sns.pairplot(reduced_features,hue='season',diag_kind='kde',palette=palette,plot_kws=dict(alpha=0.7),diag_kws=dict(shade=True))
<seaborn.axisgrid.PairGrid at 0x23f1d1c4370>

# 独热编码
features = pd.get_dummies(features)
# 提取特征和标签
labels = features['actual']
features = features.drop('actual',axis=1)
# 特征名字留着备用
feature_list = list(features.columns)
# 转换为所需格式
import numpy as np
features = np.array(features)
labels = np.array(labels)
# 数据集切分
from sklearn.model_selection import train_test_split
划分新的数据集
train_features,test_features,train_labels,test_labels = train_test_split(features,labels,test_size=0.25,random_state=0)
print("训练集特征:",train_features.shape)
print("训练集标签:",train_labels.shape)
print("测试集特征:",test_features.shape)
print("测试集标签:",test_labels.shape)
训练集特征: (1643, 17)
训练集标签: (1643,)
测试集特征: (548, 17)
测试集标签: (548,)
import pandas as pd
import numpy as np
# 统一特征
original_feature_indices = [feature_list.index(feature) for feature in feature_list if feature not in ['ws_1','prcp_1','snwd_1']]
# 重新读取老数据
original_features = pd.read_csv('temps.csv')
original_features = pd.get_dummies(original_features)
# 数据标签转换
original_labels = np.array(original_features['actual'])
original_features = original_features.drop('actual',axis=1)
original_feature_list = list(original_features.columns)
original_features = np.array(original_features)
# 老数据集切分
from sklearn.model_selection import train_test_split
original_train_features,original_test_features,original_train_labels,original_test_labels = train_test_split(original_features,original_labels,test_size=0.25,random_state=42)
# 数据建模
from sklearn.ensemble import RandomForestRegressor
# 同样参数和随机种子
rf = RandomForestRegressor(n_estimators=100,random_state=0)
# 老数据集
rf.fit(original_train_features,original_train_labels)
# 统一使用一个测试集,为了公平
predictions = rf.predict(test_features[:,original_feature_indices])
errors = abs(predictions-test_labels)
print('老数据集平均温度误差:',round(np.mean(errors),2),'°')
mape = 100 *(errors/test_labels)
# 为了观察设定准确率
accuracy = 100 -np.mean(mape)
print('Accuracy:',round(accuracy,2),'%')
老数据集平均温度误差: 4.68 °
Accuracy: 92.19 %
from sklearn.ensemble import RandomForestRegressor
# 保证标签一致 剔除新的特征
original_train_changeed_features = train_features[:,original_feature_indices]
original_test_changed_features = test_features[:,original_feature_indices]
rf = RandomForestRegressor(n_estimators=100,random_state=0)
rf.fit(original_train_changeed_features,train_labels)
# 预测
baseline_predictions = rf.predict(original_test_changed_features)
# 结果
baseline_errors = abs(baseline_predictions-test_labels)
print('新数据集平均温度误差:',round(np.mean(baseline_errors),2),'%')
baseline_mape = 100 * np.mean(baseline_errors/test_labels)
# 准确率
baseline_accuracy = 100 - baseline_mape
print('Accuracy:',round(baseline_accuracy,2),'%')
新数据集平均温度误差: 4.2 %
Accuracy: 93.12 %
数据增多以后,相同的特征,结果加强了
from sklearn.ensemble import RandomForestRegressor
rf_exp = RandomForestRegressor(n_estimators=100,random_state=0)
rf_exp.fit(train_features,train_labels)
# 同一测试集
predictions = rf_exp.predict(test_features)
# 评估
errors = abs(predictions - test_labels)
print('平均温度误差:',round(np.mean(errors),2),"%")
mape = np.mean(100*(errors/test_labels))
improvement_baseline = 100 * abs(mape-baseline_mape) / baseline_mape
print('特征增多以后模型效果变化:',round(improvement_baseline,2),'%')
# 准确率
accuracy = 100 - mape
print('Accuracy:',round(accuracy,2),'%')
平均温度误差: 4.05 %
特征增多以后模型效果变化: 3.34 %
Accuracy: 93.35 %
重要特征
importances = list(rf_exp.feature_importances_)
# 名字和数值拼接在一起
feature_importances = [(feature,round(importance,2)) for feature,importance in zip(feature_list,importances)]
# 排序
feature_importances = sorted(feature_importances,key=lambda x:x[1],reverse=True)
# 打印结果
[print('Variable:{:20} Importance: {}'.format(*pair)) for pair in feature_importances]
Variable:temp_1 Importance: 0.85
Variable:average Importance: 0.05
Variable:ws_1 Importance: 0.02
Variable:friend Importance: 0.02
Variable:year Importance: 0.01
Variable:month Importance: 0.01
Variable:day Importance: 0.01
Variable:prcp_1 Importance: 0.01
Variable:temp_2 Importance: 0.01
Variable:snwd_1 Importance: 0.0
Variable:weekday_Fri Importance: 0.0
Variable:weekday_Mon Importance: 0.0
Variable:weekday_Sat Importance: 0.0
Variable:weekday_Sun Importance: 0.0
Variable:weekday_Thurs Importance: 0.0
Variable:weekday_Tues Importance: 0.0
Variable:weekday_Wed Importance: 0.0
[None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None]
# 可视化重要指标
plt.style.use('fivethirtyeight')
x_values = list(range(len(importances)))
plt.bar(x_values,importances,orientation="vertical",color="r",edgecolor="k",linewidth=1.2)
plt.xticks(x_values,feature_list,rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances')
Text(0.5, 1.0, 'Variable Importances')

sorted_importances = [importance[1] for importance in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]
# 累计重要性
cumulative_importances = np.cumsum(sorted_importances)
# 绘制折线图
plt.plot(x_values,cumulative_importances,'g-')
plt.hlines(y=0.95,xmin=0,xmax=len(sorted_importances),color='r',linestyles='dashed')
plt.xticks(x_values,sorted_features,rotation='vertical')
plt.xlabel('Variable');plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances')
Text(0.5, 1.0, 'Cumulative Importances')

如果只使用这5个特征建模,观察结果
important_feature_names = [feature[0] for feature in feature_importances[0:5]]
# 名字
important_indices = [feature_list.index(feature) for feature in important_feature_names]
# 训练集
important_train_features = train_features[:,important_indices]
important_test_features = test_features[:,important_indices]
# 数据维度
print("important train features shape:",important_train_features.shape)
print("important test features shape:",important_test_features.shape)
# 训练模型
rf_exp.fit(important_train_features,train_labels)
# 同样的测试集
predictions = rf_exp.predict(important_test_features)
# 评估
errors = abs(predictions-test_labels)
print('平均温度误差:',round(np.mean(errors),2),"°")
mape = 100*(errors/test_labels)
accuracy = 100 - np.mean(mape)
print('Accuracy:',round(accuracy,2),"%")
important train features shape: (1643, 5)
important test features shape: (548, 5)
平均温度误差: 4.11 °
Accuracy: 93.28 %
虽然没有提升效率,那么观察一下在模型时间效率上面有没有提高???
import time
all_features_time = []
for _ in range(10):
start_time = time.time()
rf_exp.fit(train_features,train_labels)
all_features_predictions = rf_exp.predict(test_features)
end_time = time.time()
all_features_time.append(end_time-start_time)
all_features_time = np.mean(all_features_time)
print("使用所有特征与测试的平均时间消耗:",round(all_features_time,2),'s')
使用所有特征与测试的平均时间消耗: 0.71 s
# 只选用重要特征训练时
reduced_features_time = []
for _ in range(10):
start_time = time.time()
rf_exp.fit(important_train_features,train_labels)
reduced_features_predictions = rf_exp.predict(important_test_features)
end_time = time.time()
reduced_features_time.append(end_time-start_time)
reduced_features_time = np.mean(reduced_features_time)
print("使用重要特征与测试的平均时间消耗:",round(reduced_features_time,2),'s')
使用重要特征与测试的平均时间消耗: 0.42 s
# 原始模型时间效率
original_features_time =[]
for _ in range(10):
start_time =time.time()
rf.fit(original_train_features,original_train_labels)
original_features_predictions =rf.predict(test_features[:,original_feature_indices])
end_time =time.time()
original_features_time.append(end_time -start_time)
original_features_time =np.mean(original_features_time)
print("使用原始模型测试的平均时间消耗:",round(original_features_time,2),'s')
使用原始模型测试的平均时间消耗: 0.17 s
不同特征 做一下对比
# 对比展示
all_accuracy = 100 * (1-np.mean(abs(all_features_predictions-test_labels)/test_labels))
reduced_accuracy = 100 * (1-np.mean(abs(reduced_features_predictions-test_labels)/test_labels))
# 保存结果并展示
comparision = pd.DataFrame({'features':['all(17)','reduced(5)'],
'runtime':[round(all_features_time,2),round(reduced_features_time,2)],
'accuracy':[round(all_accuracy,2),round(reduced_accuracy,2)]})
comparision[['features','accuracy','runtime']]
|
features |
accuracy |
runtime |
| 0 |
all(17) |
93.35 |
0.71 |
| 1 |
reduced(5) |
93.28 |
0.42 |
# 时间效率可能会比准确率更加优先考虑
relative_accuracy_decrease = 100 * (all_accuracy - reduced_accuracy) / all_accuracy
print('相对accuracy提升:',round(relative_accuracy_decrease,3),"%")
relative_runtime_decrease = 100 * (all_features_time - reduced_features_time) / all_features_time
print("相对时间效率提升:",round(relative_runtime_decrease,3),"%")
相对accuracy提升: 0.071 %
相对时间效率提升: 40.663 %
# 原模型的预测温度对比
original_mae = np.mean(abs(original_features_predictions -test_labels))
# 所有特征预测温度对比
exp_all_mae = np.mean(abs(all_features_predictions -test_labels))
# 重要特征预测温度对比
exp_reduced_mae = np.mean(abs(reduced_features_predictions -test_labels))
# 原模型的准确率
original_accuracy = 100 * (1 - np.mean(abs(original_features_predictions - test_labels) /test_labels))
model_comparison = pd.DataFrame({'model': ['original', 'exp_all', 'exp_reduced'],
'error (degrees)': [original_mae, exp_all_mae, exp_reduced_mae],
'accuracy': [original_accuracy, all_accuracy, reduced_accuracy],
'run_time (s)': [original_features_time, all_features_time, reduced_features_time]})
# 汇聚所有实验结果
fig, (ax1,ax2,ax3) = plt.subplots(nrows=1,ncols=3,figsize=(16,5),sharex=True)
# X轴
x_values = [0,1,2]
labels = list(model_comparison['model'])
plt.xticks(x_values,labels)
# 字体大小
fontdict = {'fontsize':18}
fontdict_yaxis = {'fontsize':14}
# 预测温度和真实温度的比对比
ax1.bar(x_values,model_comparison['error (degrees)'], color=['b','r','g'],edgecolor='k',linewidth=1.5)
ax1.set_ylim(bottom=3.5, top=4.5)
ax1.set_ylabel('Error (degree) (F)',fontdict=fontdict_yaxis)
ax1.set_title('Model Error Comparison',fontdict=fontdict)
# 准确率对比
ax2.bar(x_values,model_comparison['accuracy'],color=['b','r','g'],edgecolor='k',linewidth=1.5)
ax2.set_ylim(bottom=92, top=94)
ax2.set_ylabel('Accuracy (%)',fontdict=fontdict_yaxis)
ax2.set_title('Model Accuracy Comparision',fontdict=fontdict)
# 时间效率对比
ax3.bar(x_values,model_comparison['run_time (s)'], color=['b','r','g'],edgecolor='k',linewidth=1.5)
ax3.set_ylim(bottom=0,top=1)
ax3.set_ylabel('run_time (s)',fontdict=fontdict_yaxis)
ax3.set_title('Model Run-Time Comparison',fontdict=fontdict)
plt.show()

模型调参
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
rf = RandomForestRegressor(random_state=42)
pprint(rf.get_params())
{'bootstrap': True,
'ccp_alpha': 0.0,
'criterion': 'squared_error',
'max_depth': None,
'max_features': 'auto',
'max_leaf_nodes': None,
'max_samples': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_jobs': None,
'oob_score': False,
'random_state': 42,
'verbose': 0,
'warm_start': False}
### 自动随机调参
from sklearn.model_selection import RandomizedSearchCV
# 建立树的个数
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]
# 最大特征的选择方法
max_features = ['auto','sqrt']
# 树最大深度
max_depth = [int(x) for x in np.linspace(10,20,num=2)]
max_depth.append(None)
# 节点最小分裂所需要的样本个数
min_samples_split = [2,5,10]
# 叶子节点最小的样本数
min_samples_leaf = [1,2,4]
# 样本采样方法
bootstrap = [True,False]
# 随机参数空间
random_grid ={'n_estimators':n_estimators,
'max_features':max_features,
'max_depth':max_depth,
'min_samples_split':min_samples_split,
'min_samples_leaf':min_samples_leaf,
'bootstrap':bootstrap}
#### 随机组合参数
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf, # 指定调参模型
param_distributions=random_grid, # 指定候选参数列表
n_iter=100, # 随机选择参数组合的个数,这里是随机选择100组,找这中间最好的
scoring='neg_mean_absolute_error', # 评估方法
cv=3, # 交叉验证
verbose=2, # 打印信息的数量
random_state=42, # 随机种子,随便选
n_jobs=-1) # 多线程数目,如果-1代表使用所有线程
# 寻找开始
rf_random.fit(train_features,train_labels)
rf_random.best_params_
Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 2000,
'min_samples_split': 5,
'min_samples_leaf': 4,
'max_features': 'auto',
'max_depth': None,
'bootstrap': True}
# 评估结果
def evaluate(model,test_features,test_labels):
predictions = model.predict(test_features)
errors = abs(predictions - test_labels)
mape = 100 * np.mean(errors / test_labels)
accuracy = 100 - mape
print('平均气温误差:',np.mean(errors))
print('Accuracy = {:0.2f}%'.format(accuracy))
# 默认参数结果
base_model = RandomForestRegressor(random_state=42)
base_model.fit(train_features,train_labels)
evaluate(base_model,test_features,test_labels)
平均气温误差: 4.036259124087591
Accuracy = 93.37%
# 随机组合最佳参数
best_random = rf_random.best_estimator_
evaluate(best_random,test_features,test_labels)
平均气温误差: 4.0074731175393135
Accuracy = 93.43%
网格参数搜索
{'n_estimators': 1800,
'min_samples_split': 10,
'min_samples_leaf': 4,
'max_features': 'auto',
'max_depth': None,
'bootstrap': True}
from sklearn.model_selection import GridSearchCV
# 候选参数空间
param_grid = {
'n_estimators':[1600,1700,1800,1900,2000],
'max_features':['auto'],
'max_depth':[8,10,12],
'min_samples_split':[3,5,7],
'min_samples_leaf':[2.3,4,5,6],
'bootstrap':[True]
}
# 基本算法模型
rf = RandomForestRegressor()
# 网格搜索
grid_search = GridSearchCV(estimator=rf,
param_grid=param_grid,
scoring='neg_mean_absolute_error',
cv=3,
n_jobs=-1,
verbose=2)
# 搜索开始
grid_search.fit(train_features,train_labels)
Fitting 3 folds for each of 180 candidates, totalling 540 fits
D:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning:
135 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
File "D:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "D:\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
trees = Parallel(
File "D:\anaconda3\lib\site-packages\joblib\parallel.py", line 1863, in __call__
return output if self.return_generator else list(output)
File "D:\anaconda3\lib\site-packages\joblib\parallel.py", line 1792, in _get_sequential_output
res = func(*args, **kwargs)
File "D:\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "D:\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 185, in _parallel_build_trees
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
File "D:\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 1315, in fit
super().fit(
File "D:\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 242, in fit
raise ValueError(
ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 2.3
warnings.warn(some_fits_failed_message, FitFailedWarning)
D:\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan -3.66715249 -3.67265289 -3.6666138
-3.66880025 -3.66765682 -3.6725743 -3.66850699 -3.66715993 -3.66955895
-3.66730326 -3.66887582 -3.66898245 -3.66910087 -3.66955339 -3.66925188
-3.66767582 -3.66392168 -3.66590283 -3.66647468 -3.66916971 -3.66603121
-3.66586079 -3.66445455 -3.66298478 -3.66498142 -3.66926415 -3.66660605
-3.66211951 -3.66663106 -3.66897272 -3.66051875 -3.66402215 -3.66404952
-3.66353607 -3.6642029 -3.66047745 -3.66229798 -3.6646911 -3.65990835
-3.66086848 -3.66117259 -3.66397042 -3.66353509 -3.66311066 -3.6654521
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan -3.66888733 -3.66731329 -3.66960138
-3.6676715 -3.66823412 -3.6687913 -3.66691494 -3.66508719 -3.67174204
-3.6732568 -3.66707472 -3.6662491 -3.67071602 -3.66858938 -3.6695038
-3.66534122 -3.66134385 -3.66749329 -3.66597971 -3.66399534 -3.66496658
-3.66704291 -3.66484829 -3.66720088 -3.6665224 -3.66248715 -3.66751772
-3.66803523 -3.66671033 -3.66589929 -3.66162471 -3.66317662 -3.66164416
-3.66021027 -3.66350166 -3.65923551 -3.66377361 -3.66143871 -3.6651609
-3.66614199 -3.66217163 -3.66642478 -3.66285729 -3.66038393 -3.66225526
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan -3.67097272 -3.66544877 -3.67074033
-3.67176597 -3.66914464 -3.66931447 -3.67020498 -3.66778535 -3.6680251
-3.66782811 -3.66677228 -3.67086388 -3.66895445 -3.67114891 -3.67163594
-3.6619978 -3.66453644 -3.66907959 -3.66519192 -3.66911132 -3.6655087
-3.6684413 -3.6656255 -3.66152951 -3.66630327 -3.66651272 -3.66543072
-3.66262121 -3.66430172 -3.66648642 -3.66166699 -3.66333263 -3.66292747
-3.66399535 -3.66247052 -3.66596842 -3.66142884 -3.66444085 -3.66254073
-3.66432689 -3.66124163 -3.65741632 -3.66360827 -3.66092641 -3.66143091]
warnings.warn(
GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
param_grid={'bootstrap': [True], 'max_depth': [8, 10, 12],
'max_features': ['auto'],
'min_samples_leaf': [2.3, 4, 5, 6],
'min_samples_split': [3, 5, 7],
'n_estimators': [1600, 1700, 1800, 1900, 2000]},
scoring='neg_mean_absolute_error', verbose=2)
best_grid_search = grid_search.best_estimator_
evaluate(best_grid_search,test_features,test_labels)
平均气温误差: 4.004802708677475
Accuracy = 93.44%
最终模型
print('最终模型参数:\n')
pprint(best_grid_search.get_params())
最终模型参数:
{'bootstrap': True,
'ccp_alpha': 0.0,
'criterion': 'squared_error',
'max_depth': 12,
'max_features': 'auto',
'max_leaf_nodes': None,
'max_samples': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 6,
'min_samples_split': 7,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 1700,
'n_jobs': None,
'oob_score': False,
'random_state': None,
'verbose': 0,
'warm_start': False}
贝叶斯优化寻找最佳参数
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
def hyperopt_train_test(params):
clf = RandomForestRegressor(**params)
return cross_val_score(clf,train_features,train_labels).mean()
max_depth = [i for i in range(10,20)]
# max_depth.append(None)
space4rf = {
'max_depth': hp.choice('max_depth', max_depth),
'max_features': hp.choice('max_features', ['auto','sqrt']),
'min_samples_split':hp.choice('min_samples_split',range(5,20)),
'min_samples_leaf':hp.choice('min_samples_leaf',range(2,10)),
'n_estimators': hp.choice('n_estimators', range(1000,2000)),
'bootstrap':hp.choice('bootstrap',[True,False])
}
best = 0
def f(params):
global best
acc = hyperopt_train_test(params)
if acc > best:
best = acc
print('new best:', best, params)
return {'loss': -acc, 'status': STATUS_OK}
trials = Trials()
best = fmin(f, space4rf, algo=tpe.suggest, max_evals=100, trials=trials)
print("best:",best)
new best:
0.766416424801337
{'bootstrap': False, 'max_depth': 19, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 1028}
new best:
0.8644180936765691
{'bootstrap': False, 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 1363}
new best:
0.8679831214513388
{'bootstrap': True, 'max_depth': 12, 'max_features': 'auto', 'min_samples_leaf': 6, 'min_samples_split': 18, 'n_estimators': 1275}
new best:
0.8683413950549937
{'bootstrap': True, 'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 7, 'min_samples_split': 18, 'n_estimators': 1863}
new best:
0.8683946223494816
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1933}
new best:
0.8684885517659223
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1933}
new best:
0.8686051137472097
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1408}
34%|██████████████▉ | 34/100 [25:19<1:06:39, 60.60s/trial, best loss: -0.8686051137472097]
贝叶斯优化的最大特点是每一次优化都对后面结果产生影响,而随机和网格每次都是独立的,不影响后面的结果,随机适合在开始时候不知道参数,网格用在随机后面。