python随机森林进行气温预测

数据赶考人 / 2024-02-04 / 原文

天气最高温度

我们要完成三个任务

随机森林建模 --》选择特征 - 》增加数据量和特征个数 --》找到最优的参数

掌握机器学习里面2种经典的参数调节方法

读数据

import pandas as pd

data = pd.read_csv("temps.csv")
data.head()

	year	month	day	week	temp_2	temp_1	average	actual	friend
0	2019	1	1	Fri	45	45	45.6	45	29
1	2019	1	2	Sat	44	45	45.7	44	61
2	2019	1	3	Sun	45	44	45.8	41	56
3	2019	1	4	Mon	44	41	45.9	40	53
4	2019	1	5	Tues	41	40	46.0	44	41

## 看看数据多少维度
print(data.shape)

(348, 9)

data.describe()

	year	month	day	temp_2	temp_1	average	actual	friend
count	348.0	348.000000	348.000000	348.000000	348.000000	348.000000	348.000000	348.000000
mean	2019.0	6.477011	15.514368	62.652299	62.701149	59.760632	62.543103	60.034483
std	0.0	3.498380	8.772982	12.165398	12.120542	10.527306	11.794146	15.626179
min	2019.0	1.000000	1.000000	35.000000	35.000000	45.100000	35.000000	28.000000
25%	2019.0	3.000000	8.000000	54.000000	54.000000	49.975000	54.000000	47.750000
50%	2019.0	6.000000	15.000000	62.500000	62.500000	58.200000	62.500000	60.000000
75%	2019.0	10.000000	23.000000	71.000000	71.000000	69.025000	71.000000	71.000000
max	2019.0	12.000000	31.000000	117.000000	117.000000	77.400000	92.000000	95.000000

时间处理函数

import datetime

years = data['year']
months = data['month']
days = data['day']

dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year , month, day in zip (years,months,days)]

dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]

dates[:5]

[datetime.datetime(2019, 1, 1, 0, 0),
 datetime.datetime(2019, 1, 2, 0, 0),
 datetime.datetime(2019, 1, 3, 0, 0),
 datetime.datetime(2019, 1, 4, 0, 0),
 datetime.datetime(2019, 1, 5, 0, 0)]

数据展示

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight') # 绘图风格

展示四个指标：分别为最高气温的标签值、前天、昨天、朋友预测的气温最高值，四个图。

fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2,figsize=(10,10))
fig.autofmt_xdate(rotation=45)
# 最高气温的标签值
ax1.plot(dates,data['actual'])
ax1.set_xlabel('');ax1.set_ylabel('Temperature');ax1.set_title('Max Temp')
# 昨天的最高温度值
ax2.plot(dates,data['temp_1'])
ax2.set_xlabel('');ax2.set_ylabel('Temperature');ax2.set_title('Yesterday Max Temp')
# 前天的最高温度值
ax3.plot(dates,data['temp_2'])
ax3.set_xlabel('');ax3.set_ylabel('Temperature');ax3.set_title('Two Days Prior Max Temp')
# 朋友预测的最高温度值
ax4.plot(dates,data['friend'])
ax4.set_xlabel('');ax4.set_ylabel('Temperature');ax4.set_title('Friend Forcast')
plt.tight_layout(pad=2)

数据预处理，比如周几，这个要转成计算机可识别的数字

# 独热编码
data = pd.get_dummies(data) #自动转换，自动添加后缀
data.head(5)

	year	month	day	temp_2	temp_1	average	actual	friend	week_Fri	week_Mon	week_Sat	week_Sun	week_Tues
0	2019	1	1	45	45	45.6	45	29	1	0	0	0	0
1	2019	1	2	44	45	45.7	44	61	0	0	1	0	0
2	2019	1	3	45	44	45.8	41	56	0	0	0	1	0
3	2019	1	4	44	41	45.9	40	53	0	1	0	0	0
4	2019	1	5	41	40	46.0	44	41	0	0	0	0	1

# 数据与标签
import numpy as np
# 标签
labels = np.array(data['actual'])
# 特征中去除标签
data = data.drop('actual',axis=1) # 按照列去掉
# 名字单独保留
feature_list = list(data.columns)
# 转换为合适的格式
features = np.array(data)

# 数据集切分
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features,labels,test_size=0.25,random_state=42)
print('训练集特征：',train_features.shape)
print('训练集标签：',train_labels.shape)
print('测试集标签：',test_features.shape)
print('测试机标签：',test_labels.shape)

训练集特征： (261, 14)
训练集标签： (261,)
测试集标签： (87, 14)
测试机标签： (87,)

### 构建随机森林模型，采用 mape 平均绝对误差百分比
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1000,random_state=42)
rf.fit(train_features,train_labels)
predictions = rf.predict(test_features)
errors = abs(predictions - test_labels)
mape = 100 * (errors / test_labels)
print('MAPE:',np.mean(mape))

MAPE: 6.016378550202468

from sklearn.tree import export_graphviz
import pydot
import os

tree = rf.estimators_[5]
export_graphviz(tree,out_file="tree.dot",feature_names=feature_list,rounded=True,precision=1)
(graph,) = pydot.graph_from_dot_file('./tree.dot')
graph.write_png('tree.png')

# 限制树模型
rf_small = RandomForestRegressor(n_estimators=10,max_depth=3,random_state=42)
rf_small.fit(train_features,train_labels)
tree_small = rf_small.estimators_[5]
export_graphviz(tree_small,out_file='small_tree.dot',feature_names=feature_list,rounded=True,precision=1)
(graph,) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')

# 决策树特征重要性
importances = list(rf.feature_importances_)
# 格式转换
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list,importances)]
feature_importances = sorted(feature_importances,key=lambda x:x[1],reverse=True)
# 打印
[print('Variable:{:20} importance: {}'.format(*pair)) for pair in feature_importances]

Variable:temp_1               importance: 0.69
Variable:average              importance: 0.2
Variable:day                  importance: 0.03
Variable:friend               importance: 0.03
Variable:temp_2               importance: 0.02
Variable:month                importance: 0.01
Variable:year                 importance: 0.0
Variable:week_Fri             importance: 0.0
Variable:week_Mon             importance: 0.0
Variable:week_Sat             importance: 0.0
Variable:week_Sun             importance: 0.0
Variable:week_Thurs           importance: 0.0
Variable:week_Tues            importance: 0.0
Variable:week_Wed             importance: 0.0





[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

# 绘制为直方图
x_values = list(range(len(importances)))
plt.bar(x_values,importances,orientation='vertical')
plt.xticks(x_values,feature_list,rotation='vertical')
plt.ylabel('Importance');plt.xlabel('Variable');plt.title('Variable Importances')

Text(0.5, 1.0, 'Variable Importances')

# 尝试使用最重要的两个特征
rf_most_important = RandomForestRegressor(n_estimators=1000,random_state=42)
# 最重要特征
important_indices = [feature_list.index('temp_1'),feature_list.index('average')]
train_important = train_features[:,important_indices]
test_important = test_features[:,important_indices]
# 重新训练模型
rf_most_important.fit(train_important,train_labels)
# 预测结果
predictions = rf_most_important.predict(test_important)
errors = abs(predictions-test_labels)
# 评估结果，保留两位小数
print('Mean Absolute Error:',round(np.mean(errors),2),'%')
mape = np.mean(100*(errors/test_labels))
print('mape:',mape)

Mean Absolute Error: 3.92 %
mape: 6.243108595734665

发现，mape的值从6.0上升到6.2，并没有下降，说明不能只选择最重要的特征

# 日期
months = features[:,feature_list.index('month')]
days = features[:,feature_list.index('day')]
years = features[:,feature_list.index('year')]
# 转换日期
dates = [str(int(year))+'-'+str(int(month))+'-'+str(int(day)) for year, month, day in zip(years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]
# 创建表格保存日期和其对应的标签数据
true_data = pd.DataFrame(data={'date':dates,'actual':labels})
# 另一个表格表示日期和对应预测值
months = test_features[:,feature_list.index('month')]
days = test_features[:,feature_list.index('day')]
years = test_features[:,feature_list.index('year')]
test_dates = [str(int(year))+'-'+str(int(month))+'-'+str(int(day)) for year,month,day in zip(years,months,days)]
test_dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in test_dates]
predictions_data = pd.DataFrame(data = {'date':test_dates,'prediction':predictions})
# 真实值
plt.plot(true_data['date'],true_data['actual'],'b-',label='actual')
# 预测值
plt.plot(predictions_data['date'],predictions_data['prediction'],'ro',label='prediction')
plt.xticks(rotation='60')
plt.legend()

#图名
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)');  plt.title('Acture and Predicted Values');

### 深入数据分析
## 如果可以利用的数据量增大，会对结果产生什么影响呢
## 加入的新特征会改进模型效果吗，此时的时间效率又怎么样

数据增多，采用新的数据集

import pandas as pd
features = pd.read_csv('temps_extended.csv')
features.head(5)

	year	month	day	weekday	ws_1	prcp_1	temp_2	temp_1	average	actual	friend
0	2011	1	1	Sat	4.92	0.00	36	37	45.6	40	40
1	2011	1	2	Sun	5.37	0.00	37	40	45.7	39	50
2	2011	1	3	Mon	6.26	0.00	40	39	45.8	42	42
3	2011	1	4	Tues	5.59	0.00	39	42	45.9	38	59
4	2011	1	5	Wed	3.80	0.03	42	38	46.0	45	39

print('数据规模',features.shape)

数据规模 (2191, 12)

## 时间转化，用标准时间格式方便后续工作
import datetime
years = features['year']
months = features['month']
days = features['day']
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]
dates[:5]

[datetime.datetime(2011, 1, 1, 0, 0),
 datetime.datetime(2011, 1, 2, 0, 0),
 datetime.datetime(2011, 1, 3, 0, 0),
 datetime.datetime(2011, 1, 4, 0, 0),
 datetime.datetime(2011, 1, 5, 0, 0)]

# 对新特征进行可视化展示
fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2,ncols=2,figsize=(15,10))
fig.autofmt_xdate(rotation=45)
# 平均最高气温
ax1.plot(dates,features['average'])
ax1.set_xlabel('');ax1.set_ylabel('Tempertature (F)');ax1.set_title('Historical Avg Max Temp')
# 风速
ax2.plot(dates,features['ws_1'],'r-')
ax2.set_xlabel('');ax2.set_ylabel('Wind Speed (mph))');ax2.set_title('Prior Wind Speed')
# 降水
ax3.plot(dates,features['prcp_1'],'r-')
ax3.set_xlabel('Date');ax3.set_ylabel('Precipitation (in)');ax3.set_title('Prior Precipitation')
# 积雪
ax4.plot(dates,features['snwd_1'],'ro')
ax4.set_xlabel('Date');ax4.set_ylabel('Snow Depth (in)');ax4.set_title('Prior Snow Depth')

plt.tight_layout(pad=2)

天气变换与季节因素有关，然而数据集中并没有体现季节的特征，可以自己创建

# 季节变量
seasons = []
for month in features['month']:
    if month in [1,2,12]:
        seasons.append('winter')
    elif month in [3,4,5]:
        seasons.append('spring')
    elif month in [6,7,8]:
        seasons.append('summer')
    elif month in [9,10,11]:
        seasons.append('fall')
reduced_features = features[['temp_1','prcp_1','average','actual']]
reduced_features['season'] = seasons

C:\Users\Owner\AppData\Local\Temp\ipykernel_15292\2969630295.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_features['season'] = seasons

import seaborn as sns
sns.set(style='ticks',color_codes=True)
# 主题
palette = sns.xkcd_palette(['dark blue','dark green','gold','orange'])
# pairplot绘图
sns.pairplot(reduced_features,hue='season',diag_kind='kde',palette=palette,plot_kws=dict(alpha=0.7),diag_kws=dict(shade=True))

<seaborn.axisgrid.PairGrid at 0x23f1d1c4370>

# 独热编码
features = pd.get_dummies(features)
# 提取特征和标签
labels = features['actual']
features = features.drop('actual',axis=1)
# 特征名字留着备用
feature_list = list(features.columns)

# 转换为所需格式
import numpy as np
features = np.array(features)
labels = np.array(labels)
# 数据集切分
from sklearn.model_selection import train_test_split

划分新的数据集

train_features,test_features,train_labels,test_labels = train_test_split(features,labels,test_size=0.25,random_state=0)
print("训练集特征：",train_features.shape)
print("训练集标签：",train_labels.shape)
print("测试集特征：",test_features.shape)
print("测试集标签：",test_labels.shape)

训练集特征： (1643, 17)
训练集标签： (1643,)
测试集特征： (548, 17)
测试集标签： (548,)


import pandas as pd
import numpy as np
# 统一特征
original_feature_indices = [feature_list.index(feature) for feature in feature_list if feature not in ['ws_1','prcp_1','snwd_1']]
# 重新读取老数据
original_features = pd.read_csv('temps.csv')
original_features = pd.get_dummies(original_features)
# 数据标签转换
original_labels = np.array(original_features['actual'])
original_features = original_features.drop('actual',axis=1)
original_feature_list = list(original_features.columns)
original_features = np.array(original_features)

# 老数据集切分
from sklearn.model_selection import train_test_split
original_train_features,original_test_features,original_train_labels,original_test_labels = train_test_split(original_features,original_labels,test_size=0.25,random_state=42)
# 数据建模
from sklearn.ensemble import RandomForestRegressor
# 同样参数和随机种子
rf = RandomForestRegressor(n_estimators=100,random_state=0)
# 老数据集
rf.fit(original_train_features,original_train_labels)
# 统一使用一个测试集，为了公平
predictions = rf.predict(test_features[:,original_feature_indices])
errors = abs(predictions-test_labels)
print('老数据集平均温度误差：',round(np.mean(errors),2),'°')
mape = 100 *(errors/test_labels)
# 为了观察设定准确率
accuracy = 100 -np.mean(mape)
print('Accuracy:',round(accuracy,2),'%')

老数据集平均温度误差： 4.68 °
Accuracy: 92.19 %

from sklearn.ensemble import RandomForestRegressor
# 保证标签一致 剔除新的特征
original_train_changeed_features = train_features[:,original_feature_indices]
original_test_changed_features = test_features[:,original_feature_indices]
rf = RandomForestRegressor(n_estimators=100,random_state=0)
rf.fit(original_train_changeed_features,train_labels)
# 预测
baseline_predictions = rf.predict(original_test_changed_features)
# 结果
baseline_errors = abs(baseline_predictions-test_labels)
print('新数据集平均温度误差:',round(np.mean(baseline_errors),2),'%')
baseline_mape = 100 * np.mean(baseline_errors/test_labels)
# 准确率
baseline_accuracy = 100 - baseline_mape
print('Accuracy:',round(baseline_accuracy,2),'%')

新数据集平均温度误差: 4.2 %
Accuracy: 93.12 %

数据增多以后，相同的特征，结果加强了

from sklearn.ensemble import RandomForestRegressor
rf_exp = RandomForestRegressor(n_estimators=100,random_state=0)
rf_exp.fit(train_features,train_labels)
# 同一测试集
predictions = rf_exp.predict(test_features)
# 评估
errors = abs(predictions - test_labels)
print('平均温度误差：',round(np.mean(errors),2),"%")
mape = np.mean(100*(errors/test_labels))
improvement_baseline = 100 * abs(mape-baseline_mape) / baseline_mape
print('特征增多以后模型效果变化：',round(improvement_baseline,2),'%')
# 准确率
accuracy = 100 - mape
print('Accuracy:',round(accuracy,2),'%')

平均温度误差： 4.05 %
特征增多以后模型效果变化： 3.34 %
Accuracy: 93.35 %

重要特征

importances = list(rf_exp.feature_importances_)
# 名字和数值拼接在一起
feature_importances = [(feature,round(importance,2)) for feature,importance in zip(feature_list,importances)]
# 排序
feature_importances = sorted(feature_importances,key=lambda x:x[1],reverse=True)
# 打印结果
[print('Variable:{:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable:temp_1               Importance: 0.85
Variable:average              Importance: 0.05
Variable:ws_1                 Importance: 0.02
Variable:friend               Importance: 0.02
Variable:year                 Importance: 0.01
Variable:month                Importance: 0.01
Variable:day                  Importance: 0.01
Variable:prcp_1               Importance: 0.01
Variable:temp_2               Importance: 0.01
Variable:snwd_1               Importance: 0.0
Variable:weekday_Fri          Importance: 0.0
Variable:weekday_Mon          Importance: 0.0
Variable:weekday_Sat          Importance: 0.0
Variable:weekday_Sun          Importance: 0.0
Variable:weekday_Thurs        Importance: 0.0
Variable:weekday_Tues         Importance: 0.0
Variable:weekday_Wed          Importance: 0.0





[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

# 可视化重要指标
plt.style.use('fivethirtyeight')
x_values = list(range(len(importances)))
plt.bar(x_values,importances,orientation="vertical",color="r",edgecolor="k",linewidth=1.2)
plt.xticks(x_values,feature_list,rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances')

Text(0.5, 1.0, 'Variable Importances')

sorted_importances = [importance[1] for importance  in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]
# 累计重要性
cumulative_importances = np.cumsum(sorted_importances)
# 绘制折线图
plt.plot(x_values,cumulative_importances,'g-')
plt.hlines(y=0.95,xmin=0,xmax=len(sorted_importances),color='r',linestyles='dashed')
plt.xticks(x_values,sorted_features,rotation='vertical')
plt.xlabel('Variable');plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances')

Text(0.5, 1.0, 'Cumulative Importances')

如果只使用这5个特征建模，观察结果

important_feature_names = [feature[0] for feature in feature_importances[0:5]]
# 名字
important_indices = [feature_list.index(feature) for feature in important_feature_names]
# 训练集
important_train_features = train_features[:,important_indices]
important_test_features = test_features[:,important_indices]
# 数据维度
print("important train features shape:",important_train_features.shape)
print("important test features shape:",important_test_features.shape)
# 训练模型
rf_exp.fit(important_train_features,train_labels)
# 同样的测试集
predictions = rf_exp.predict(important_test_features)
# 评估
errors = abs(predictions-test_labels)
print('平均温度误差：',round(np.mean(errors),2),"°")
mape = 100*(errors/test_labels)
accuracy = 100 - np.mean(mape)
print('Accuracy:',round(accuracy,2),"%")

important train features shape: (1643, 5)
important test features shape: (548, 5)
平均温度误差： 4.11 °
Accuracy: 93.28 %

虽然没有提升效率，那么观察一下在模型时间效率上面有没有提高???

import time
all_features_time = []
for _ in range(10):
    start_time = time.time()
    rf_exp.fit(train_features,train_labels)
    all_features_predictions = rf_exp.predict(test_features)
    end_time = time.time()
    all_features_time.append(end_time-start_time)
    
all_features_time = np.mean(all_features_time)
print("使用所有特征与测试的平均时间消耗：",round(all_features_time,2),'s')

使用所有特征与测试的平均时间消耗： 0.71 s

# 只选用重要特征训练时
reduced_features_time = []
for _ in range(10):
    start_time = time.time()
    rf_exp.fit(important_train_features,train_labels)
    reduced_features_predictions = rf_exp.predict(important_test_features)
    end_time = time.time()
    reduced_features_time.append(end_time-start_time)
    
reduced_features_time = np.mean(reduced_features_time)
print("使用重要特征与测试的平均时间消耗：",round(reduced_features_time,2),'s')

使用重要特征与测试的平均时间消耗： 0.42 s

# 原始模型时间效率
original_features_time =[]
for _ in range(10):
    start_time =time.time()
    rf.fit(original_train_features,original_train_labels)
    original_features_predictions =rf.predict(test_features[:,original_feature_indices])
    end_time =time.time()
    original_features_time.append(end_time -start_time)
original_features_time =np.mean(original_features_time)

print("使用原始模型测试的平均时间消耗：",round(original_features_time,2),'s')

使用原始模型测试的平均时间消耗： 0.17 s

不同特征做一下对比

# 对比展示
all_accuracy = 100 * (1-np.mean(abs(all_features_predictions-test_labels)/test_labels))
reduced_accuracy = 100 * (1-np.mean(abs(reduced_features_predictions-test_labels)/test_labels))

# 保存结果并展示
comparision = pd.DataFrame({'features':['all(17)','reduced(5)'],
                           'runtime':[round(all_features_time,2),round(reduced_features_time,2)],
                           'accuracy':[round(all_accuracy,2),round(reduced_accuracy,2)]})
comparision[['features','accuracy','runtime']]

	features	accuracy	runtime
0	all(17)	93.35	0.71
1	reduced(5)	93.28	0.42

# 时间效率可能会比准确率更加优先考虑
relative_accuracy_decrease =  100 * (all_accuracy - reduced_accuracy) / all_accuracy
print('相对accuracy提升：',round(relative_accuracy_decrease,3),"%")
relative_runtime_decrease = 100 * (all_features_time - reduced_features_time) / all_features_time
print("相对时间效率提升:",round(relative_runtime_decrease,3),"%")

相对accuracy提升： 0.071 %
相对时间效率提升: 40.663 %

# 原模型的预测温度对比
original_mae = np.mean(abs(original_features_predictions -test_labels))
# 所有特征预测温度对比
exp_all_mae = np.mean(abs(all_features_predictions -test_labels))
# 重要特征预测温度对比
exp_reduced_mae = np.mean(abs(reduced_features_predictions -test_labels))
# 原模型的准确率
original_accuracy = 100 * (1 - np.mean(abs(original_features_predictions - test_labels) /test_labels))
model_comparison = pd.DataFrame({'model': ['original', 'exp_all', 'exp_reduced'],
                                'error (degrees)': [original_mae, exp_all_mae, exp_reduced_mae],
                                'accuracy': [original_accuracy, all_accuracy, reduced_accuracy],
                                'run_time (s)': [original_features_time, all_features_time, reduced_features_time]})

# 汇聚所有实验结果
fig, (ax1,ax2,ax3) = plt.subplots(nrows=1,ncols=3,figsize=(16,5),sharex=True)
# X轴
x_values = [0,1,2]
labels = list(model_comparison['model'])
plt.xticks(x_values,labels)
# 字体大小
fontdict = {'fontsize':18}
fontdict_yaxis = {'fontsize':14}
# 预测温度和真实温度的比对比
ax1.bar(x_values,model_comparison['error (degrees)'], color=['b','r','g'],edgecolor='k',linewidth=1.5)
ax1.set_ylim(bottom=3.5, top=4.5)
ax1.set_ylabel('Error (degree) (F)',fontdict=fontdict_yaxis)
ax1.set_title('Model Error Comparison',fontdict=fontdict)
# 准确率对比
ax2.bar(x_values,model_comparison['accuracy'],color=['b','r','g'],edgecolor='k',linewidth=1.5)
ax2.set_ylim(bottom=92, top=94)
ax2.set_ylabel('Accuracy (%)',fontdict=fontdict_yaxis)
ax2.set_title('Model Accuracy Comparision',fontdict=fontdict)
# 时间效率对比
ax3.bar(x_values,model_comparison['run_time (s)'], color=['b','r','g'],edgecolor='k',linewidth=1.5)
ax3.set_ylim(bottom=0,top=1)
ax3.set_ylabel('run_time (s)',fontdict=fontdict_yaxis)
ax3.set_title('Model Run-Time Comparison',fontdict=fontdict)
plt.show()

模型调参

from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
rf = RandomForestRegressor(random_state=42)
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

###  自动随机调参

from sklearn.model_selection import RandomizedSearchCV
# 建立树的个数
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]
# 最大特征的选择方法
max_features = ['auto','sqrt']
# 树最大深度
max_depth = [int(x) for x in np.linspace(10,20,num=2)]
max_depth.append(None)
# 节点最小分裂所需要的样本个数
min_samples_split = [2,5,10]
# 叶子节点最小的样本数
min_samples_leaf = [1,2,4]
# 样本采样方法
bootstrap = [True,False]
# 随机参数空间
random_grid ={'n_estimators':n_estimators,
             'max_features':max_features,
             'max_depth':max_depth,
             'min_samples_split':min_samples_split,
             'min_samples_leaf':min_samples_leaf,
             'bootstrap':bootstrap}

####  随机组合参数

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf, # 指定调参模型
                            param_distributions=random_grid, # 指定候选参数列表
                            n_iter=100, # 随机选择参数组合的个数，这里是随机选择100组，找这中间最好的
                            scoring='neg_mean_absolute_error', # 评估方法
                            cv=3, # 交叉验证
                            verbose=2, # 打印信息的数量
                            random_state=42, # 随机种子，随便选
                            n_jobs=-1) # 多线程数目，如果-1代表使用所有线程
# 寻找开始
rf_random.fit(train_features,train_labels)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits





{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

# 评估结果
def evaluate(model,test_features,test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('平均气温误差：',np.mean(errors))
    print('Accuracy = {:0.2f}%'.format(accuracy))
# 默认参数结果
base_model = RandomForestRegressor(random_state=42)
base_model.fit(train_features,train_labels)
evaluate(base_model,test_features,test_labels)

平均气温误差： 4.036259124087591
Accuracy = 93.37%

# 随机组合最佳参数
best_random = rf_random.best_estimator_
evaluate(best_random,test_features,test_labels)

平均气温误差： 4.0074731175393135
Accuracy = 93.43%

网格参数搜索

{'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

from sklearn.model_selection import GridSearchCV
# 候选参数空间
param_grid = {
    'n_estimators':[1600,1700,1800,1900,2000],
    'max_features':['auto'],
    'max_depth':[8,10,12],
    'min_samples_split':[3,5,7],
    'min_samples_leaf':[2.3,4,5,6],
    'bootstrap':[True]
}
# 基本算法模型
rf = RandomForestRegressor()
# 网格搜索
grid_search = GridSearchCV(estimator=rf,
                          param_grid=param_grid,
                          scoring='neg_mean_absolute_error',
                          cv=3,
                          n_jobs=-1,
                          verbose=2)
# 搜索开始
grid_search.fit(train_features,train_labels)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


D:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning: 
135 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "D:\anaconda3\lib\site-packages\joblib\parallel.py", line 1863, in __call__
    return output if self.return_generator else list(output)
  File "D:\anaconda3\lib\site-packages\joblib\parallel.py", line 1792, in _get_sequential_output
    res = func(*args, **kwargs)
  File "D:\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
    return self.function(*args, **kwargs)
  File "D:\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 185, in _parallel_build_trees
    tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
  File "D:\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 1315, in fit
    super().fit(
  File "D:\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 242, in fit
    raise ValueError(
ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 2.3

  warnings.warn(some_fits_failed_message, FitFailedWarning)
D:\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning: One or more of the test scores are non-finite: [        nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan -3.66715249 -3.67265289 -3.6666138
 -3.66880025 -3.66765682 -3.6725743  -3.66850699 -3.66715993 -3.66955895
 -3.66730326 -3.66887582 -3.66898245 -3.66910087 -3.66955339 -3.66925188
 -3.66767582 -3.66392168 -3.66590283 -3.66647468 -3.66916971 -3.66603121
 -3.66586079 -3.66445455 -3.66298478 -3.66498142 -3.66926415 -3.66660605
 -3.66211951 -3.66663106 -3.66897272 -3.66051875 -3.66402215 -3.66404952
 -3.66353607 -3.6642029  -3.66047745 -3.66229798 -3.6646911  -3.65990835
 -3.66086848 -3.66117259 -3.66397042 -3.66353509 -3.66311066 -3.6654521
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan -3.66888733 -3.66731329 -3.66960138
 -3.6676715  -3.66823412 -3.6687913  -3.66691494 -3.66508719 -3.67174204
 -3.6732568  -3.66707472 -3.6662491  -3.67071602 -3.66858938 -3.6695038
 -3.66534122 -3.66134385 -3.66749329 -3.66597971 -3.66399534 -3.66496658
 -3.66704291 -3.66484829 -3.66720088 -3.6665224  -3.66248715 -3.66751772
 -3.66803523 -3.66671033 -3.66589929 -3.66162471 -3.66317662 -3.66164416
 -3.66021027 -3.66350166 -3.65923551 -3.66377361 -3.66143871 -3.6651609
 -3.66614199 -3.66217163 -3.66642478 -3.66285729 -3.66038393 -3.66225526
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan -3.67097272 -3.66544877 -3.67074033
 -3.67176597 -3.66914464 -3.66931447 -3.67020498 -3.66778535 -3.6680251
 -3.66782811 -3.66677228 -3.67086388 -3.66895445 -3.67114891 -3.67163594
 -3.6619978  -3.66453644 -3.66907959 -3.66519192 -3.66911132 -3.6655087
 -3.6684413  -3.6656255  -3.66152951 -3.66630327 -3.66651272 -3.66543072
 -3.66262121 -3.66430172 -3.66648642 -3.66166699 -3.66333263 -3.66292747
 -3.66399535 -3.66247052 -3.66596842 -3.66142884 -3.66444085 -3.66254073
 -3.66432689 -3.66124163 -3.65741632 -3.66360827 -3.66092641 -3.66143091]
  warnings.warn(





GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 10, 12],
                         'max_features': ['auto'],
                         'min_samples_leaf': [2.3, 4, 5, 6],
                         'min_samples_split': [3, 5, 7],
                         'n_estimators': [1600, 1700, 1800, 1900, 2000]},
             scoring='neg_mean_absolute_error', verbose=2)

best_grid_search = grid_search.best_estimator_
evaluate(best_grid_search,test_features,test_labels)

平均气温误差： 4.004802708677475
Accuracy = 93.44%

最终模型

print('最终模型参数:\n')
pprint(best_grid_search.get_params())

最终模型参数:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 12,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 6,
 'min_samples_split': 7,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1700,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

贝叶斯优化寻找最佳参数

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
def hyperopt_train_test(params):
    clf = RandomForestRegressor(**params)
    return cross_val_score(clf,train_features,train_labels).mean()
max_depth = [i for i in range(10,20)]
# max_depth.append(None)
space4rf = {
    'max_depth': hp.choice('max_depth', max_depth),
    'max_features': hp.choice('max_features', ['auto','sqrt']),
    'min_samples_split':hp.choice('min_samples_split',range(5,20)),
    'min_samples_leaf':hp.choice('min_samples_leaf',range(2,10)),
    'n_estimators': hp.choice('n_estimators', range(1000,2000)),
    'bootstrap':hp.choice('bootstrap',[True,False])
}

best = 0
def f(params):
    global best
    acc = hyperopt_train_test(params)
    if acc > best:
        best = acc
        print('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space4rf, algo=tpe.suggest, max_evals=100, trials=trials)
print("best:",best)

new best:                                                                                                              
0.766416424801337                                                                                                      
{'bootstrap': False, 'max_depth': 19, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 1028}
new best:                                                                                                              
0.8644180936765691                                                                                                     
{'bootstrap': False, 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 1363}
new best:                                                                                                              
0.8679831214513388                                                                                                     
{'bootstrap': True, 'max_depth': 12, 'max_features': 'auto', 'min_samples_leaf': 6, 'min_samples_split': 18, 'n_estimators': 1275}
new best:                                                                                                              
0.8683413950549937                                                                                                     
{'bootstrap': True, 'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 7, 'min_samples_split': 18, 'n_estimators': 1863}
new best:                                                                                                              
0.8683946223494816                                                                                                     
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1933}
new best:                                                                                                              
0.8684885517659223                                                                                                     
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1933}
new best:                                                                                                              
0.8686051137472097                                                                                                     
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1408}
 34%|██████████████▉                             | 34/100 [25:19<1:06:39, 60.60s/trial, best loss: -0.8686051137472097]

贝叶斯优化的最大特点是每一次优化都对后面结果产生影响，而随机和网格每次都是独立的，不影响后面的结果，随机适合在开始时候不知道参数，网格用在随机后面。

python随机森林进行气温预测更多相关文章

今日报告-66

设置Windows10暂停更新3000天

AQS公平锁的流程

AMD锐龙7 7800X3D网游专项测试：竟比i9-13900KS强了15%

常用总线技术基本参数对比

探索图像数据中的隐藏信息：语义实体识别和关系抽取的奇妙之旅

设置Chrome浏览器自动升级

JavaScript – 小技巧 Tips

postgresql在插入数据后怎么获取自增id

EF Core 的基本使用

error: failed to push some refs to 'https://github.com/*******/********.github.io.git'

欧拉降幂

编程语言能力对比

基于机器视觉的小车轨迹控制软件界面展示

随机推荐

划水

命令拼接技巧

Fiddler抓包Android7以内版本

AJAX & AXIOS-2024/11/1

验证码处理在自动化测试中的应用

一些学科笑话

NOIP2024模拟赛20 & 11.1 小记

20241101 数据结构与算法期中机试收获

Java，启动！

什么是IT技术

即将到来！

2024/11/1日日志关于JavaScript简介&引入方式以及基础语法的学习

舍得-时间-工作是人的一生最重要的事情-自己要有私房钱-人的一生最重要的事情是书写自己的人生

2.TiUP 部署 DM 集群

原型模式的C++实现

python随机森林进行气温预测

天气最高温度

我们要完成三个任务

随机森林建模 --》 选择特征 - 》 增加数据量和特征个数 --》 找到最优的参数

掌握机器学习里面2种经典的参数调节方法

读数据

时间处理函数

数据展示

展示四个指标：分别为最高气温的标签值、前天、昨天、朋友预测的气温最高值，四个图。

数据预处理，比如 周几，这个要转成 计算机可识别的数字

发现，mape的值从6.0上升到6.2，并没有下降，说明不能只选择最重要的特征

数据增多，采用新的数据集

天气变换与季节因素有关，然而数据集中并没有体现季节的特征，可以自己创建

划分新的数据集

数据增多以后，相同的特征，结果加强了

重要特征

如果只使用这5个特征建模，观察结果

虽然没有提升效率，那么观察一下在模型时间效率上面有没有提高???

不同特征 做一下对比

模型调参

网格参数搜索

最终模型

贝叶斯优化寻找最佳参数

python随机森林进行气温预测更多相关文章

随机推荐

热门话题

随机森林建模 --》选择特征 - 》增加数据量和特征个数 --》找到最优的参数

数据预处理，比如周几，这个要转成计算机可识别的数字

不同特征做一下对比