# Insurance-claim regression script: tunes random-forest, GBDT and XGBoost
# regressors with grid search and blends their test-set predictions.
import numpy as np
|
|
np.random.seed(42)
|
|
|
|
from sklearn.tree import DecisionTreeRegressor
|
|
from sklearn import tree
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import mean_squared_error
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.inspection import plot_partial_dependence
|
|
import pydotplus
|
|
from pydotplus import graph_from_dot_data
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
|
from sklearn.feature_extraction import DictVectorizer
|
|
from sklearn.ensemble import RandomForestRegressor
|
|
from sklearn.tree import DecisionTreeRegressor, export_graphviz
|
|
from sklearn.preprocessing import OneHotEncoder
|
|
from sklearn.ensemble import GradientBoostingRegressor
|
|
import xgboost as xgb
|
|
from xgboost import plot_importance
|
|
from matplotlib import pyplot as plt
|
|
from sklearn import linear_model
|
|
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
|
|
#import graphviz
|
|
|
|
# Load the dataset.
# A raw string keeps the Windows path separators literal: "\p" and "\i" are
# invalid escape sequences in a regular string literal and trigger a
# DeprecationWarning/SyntaxWarning on current Python versions.
data = pd.read_csv(r"G:\py数据库\insurance_data.csv")

# One-hot encode the categorical "region" column.
ohe = OneHotEncoder()
# Select the column by name (the same name that is dropped below) instead of
# by position, so a reordered CSV cannot silently encode a different column.
data_transform = ohe.fit_transform(data[["region"]]).toarray()

# OneHotEncoder emits its output columns in the order of ``ohe.categories_``
# (the sorted category values), not in the order a hand-written label list
# happens to use.  Labelling from the fitted encoder keeps every indicator
# column attached to the region it really encodes.
data_transform4 = pd.DataFrame(data_transform,
                               columns=ohe.categories_[0],
                               index=data.index)

# Swap the raw "region" column for its indicator columns.
data2 = data.drop(["region"], axis=1)
data3 = pd.concat((data2, data_transform4), axis=1)

# Fixed column layout: feature columns first, the target ("claim") last.
# The four region names must match the category values present in the CSV;
# a mismatch now fails loudly with a KeyError instead of mislabelling data.
order = ["age", "gender", "bmi", "bloodpressure", "diabetic", "children", "smoker",
         "southwest", "southeast", "northwest", "northeast",
         "claim"]
data4 = data3[order]

# Build the modelling dataset: every column except the target is a feature.
x = data4[order[:-1]]
y = data4["claim"]

# Hold out 30% of the rows for testing.  No random_state is passed, so the
# split draws from NumPy's global RNG, which this file seeds near the top.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Random forest: tune hyper-parameters with grid search, then refit.
rf = RandomForestRegressor()

# Hyper-parameter grid explored by the search.
param_grid_rf = {'max_depth': [4, 5, 6],
                 'max_features': ['sqrt', 'log2'],
                 'min_samples_leaf': [50, 60],
                 'n_estimators': [300]}

# 10-fold cross-validated grid search, run on the training split only.
grid_search = GridSearchCV(rf, param_grid_rf, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# Refit with a fixed seed using *every* tuned parameter.  Unpacking the dict
# carries n_estimators along with the tree-shape parameters, so the final
# model matches the configuration that was actually cross-validated.
rf = RandomForestRegressor(**best_params, random_state=42)

# Fit on the training split only.  Fitting on the full (x, y) would expose
# the test rows to the model and inflate both score_rf and y_pred_rf, which
# are evaluated on x_test below.
rf.fit(x_train, y_train)
score_rf = rf.score(x_test, y_test)
y_pred_rf = rf.predict(x_test)
# Gradient-boosted trees: grid-search the tree shape, then refit with a
# fixed seed on the training split and evaluate on the held-out split.
gbdt = GradientBoostingRegressor()

# Candidate values for each tuned hyper-parameter.
param_grid_gbdt = dict(
    max_depth=[3, 4, 5],
    max_features=['sqrt', 'log2'],
    min_samples_leaf=[50, 60, 70, 80, 90],
)

# 10-fold cross-validated search over the training data.
grid_search = GridSearchCV(estimator=gbdt, param_grid=param_grid_gbdt, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# The grid contains exactly max_depth, max_features and min_samples_leaf,
# so unpacking the winning combination passes those three settings.
gbdt = GradientBoostingRegressor(random_state=42, **best_params)
gbdt.fit(x_train, y_train)

# Held-out predictions and R^2 score used later by the blend.
y_pred_gbdt = gbdt.predict(x_test)
score_gbdt = gbdt.score(x_test, y_test)
# XGBoost: tune hyper-parameters with grid search, then refit.
XGB = xgb.XGBRegressor()

# XGBoost has no ``max_features`` or ``min_samples_leaf`` parameters; those
# scikit-learn tree names are ignored by the booster (it only warns about
# unused parameters), so searching over them retrains identical models.
# The grid therefore uses XGBoost's native equivalents:
#   * colsample_bynode - fraction of columns sampled for each split,
#     expressed here as the sqrt / log2 share of the feature count;
#   * min_child_weight - minimum hessian sum per child, which for
#     squared-error regression is the minimum number of rows in a child.
n_features = x_train.shape[1]
param_grid_XGB = {
    'max_depth': [3, 4, 5],
    # max(1.0, ...) keeps the fraction valid (> 0) for a single feature.
    'colsample_bynode': [max(1.0, float(np.sqrt(n_features))) / n_features,
                         max(1.0, float(np.log2(n_features))) / n_features],
    'min_child_weight': [50, 60, 70, 80, 90, 100],
}

# 10-fold cross-validated grid search, run on the training split only.
grid_search = GridSearchCV(XGB, param_grid_XGB, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# Refit with a fixed seed using every tuned parameter.
XGB = xgb.XGBRegressor(**best_params, random_state=42)
XGB.fit(x_train, y_train)

# Held-out R^2 score and predictions used later by the blend.
score_XGB = XGB.score(x_test, y_test)
y_pred_XGB = XGB.predict(x_test)
# Blend the three models, weighting each by its R^2 score on the test split.
# NOTE(review): the weights come from the same test split the metrics below
# are computed on, so the reported numbers are optimistic; consider deriving
# the weights from cross-validation scores instead.
#
# R^2 can be zero or negative for a poor model.  Negative scores are clipped
# to zero so a bad model cannot receive a negative (sign-flipping) weight,
# and if no model has a positive score the blend falls back to a plain
# average instead of dividing by zero.  With three positive scores the
# weights are exactly score / (sum of scores).
clipped_scores = [max(score, 0.0) for score in (score_rf, score_gbdt, score_XGB)]
a = sum(clipped_scores)
if a > 0:
    subscore_rf, subscore_gbdt, subscore_XGB = (score / a for score in clipped_scores)
else:
    subscore_rf = subscore_gbdt = subscore_XGB = 1.0 / 3.0

y_pred = y_pred_rf*subscore_rf + y_pred_gbdt*subscore_gbdt + y_pred_XGB*subscore_XGB

print(y_pred)

# Evaluation metrics for the blended prediction; each is computed once and
# reused for printing.
evs_test1 = explained_variance_score(y_test, y_pred)
mae_test1 = mean_absolute_error(y_test, y_pred)
mse_test1 = mean_squared_error(y_test, y_pred)
rmse_test1 = mse_test1 ** 0.5  # root mean squared error
r2_test1 = r2_score(y_test, y_pred)

print("解释方差分:", evs_test1)
print("平均绝对误差:", mae_test1)
print("均方误差:", mse_test1)
print("均方根误差:", rmse_test1)
print("决定系数:", r2_test1)
# Line chart: true targets (red) against blended predictions (blue), plotted
# by position within the test split.
sample_positions = range(len(y_test))
_, line_ax = plt.subplots(figsize=(15, 5))
line_ax.plot(sample_positions, y_test, 'r', label='TRUE')
line_ax.plot(sample_positions, y_pred, 'b', label='PRED')
line_ax.legend()
plt.show()

# Scatter chart of predicted vs. true values; the dashed black diagonal
# spans the range of the true values and marks a perfect prediction.
target_range = [y_test.min(), y_test.max()]
_, scatter_ax = plt.subplots(figsize=(5, 5))
scatter_ax.scatter(y_test, y_pred)
scatter_ax.plot(target_range, target_range, 'k--')
scatter_ax.set_xlabel('TRUE')
scatter_ax.set_ylabel('PRED')
plt.show()