"""Predict insurance claims with an R^2-weighted blend of three tree ensembles.

Pipeline:
    1. Load the insurance data set and one-hot encode the ``region`` column.
    2. Split into train / test sets (70 / 30).
    3. Tune a RandomForest, a GBDT and an XGBoost regressor with 10-fold
       grid search on the training split only.
    4. Refit each model with its best parameters on the training split and
       score it on the held-out test split.
    5. Blend the three predictions, weighting each model by its share of the
       summed test R^2, then report metrics and plot the result.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import xgboost as xgb
from pydotplus import graph_from_dot_data
from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from xgboost import plot_importance

try:
    # Removed in scikit-learn 1.2; the name is unused below, so a missing
    # symbol must not prevent the script from running.
    from sklearn.inspection import plot_partial_dependence
except ImportError:
    plot_partial_dependence = None

# Seed NumPy's global RNG so the train/test split below is reproducible.
np.random.seed(42)

# ---------------------------------------------------------------------------
# Load the data set
# ---------------------------------------------------------------------------
# Raw string: the path contains backslashes that are not valid escapes.
data = pd.read_csv(r"G:\py数据库\insurance_data.csv")

# ---------------------------------------------------------------------------
# One-hot encode the region column
# ---------------------------------------------------------------------------
ohe = OneHotEncoder()
# Select the column by name so the encoding matches the "region" drop below
# regardless of the column's position in the file.
data_transform = ohe.fit_transform(data[["region"]]).toarray()
# OneHotEncoder emits its output columns in the order of ``categories_``
# (sorted), so the labels are taken from the encoder instead of being
# hard-coded in a possibly different order.
region_columns = [str(category) for category in ohe.categories_[0]]
data_transform4 = pd.DataFrame(
    data_transform, columns=region_columns, index=data.index
)

data2 = data.drop(["region"], axis=1)
data3 = pd.concat((data2, data_transform4), axis=1)

base_features = [
    "age",
    "gender",
    "bmi",
    "bloodpressure",
    "diabetic",
    "children",
    "smoker",
]
feature_columns = base_features + region_columns
order = feature_columns + ["claim"]
data4 = data3[order]

# ---------------------------------------------------------------------------
# Feature matrix, target and train/test split
# ---------------------------------------------------------------------------
x = data4[feature_columns]
y = data4["claim"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# ---------------------------------------------------------------------------
# Random forest
# ---------------------------------------------------------------------------
rf = RandomForestRegressor()
param_grid_rf = {
    'max_depth': [4, 5, 6],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [50, 60],
    'n_estimators': [300],
}
grid_search = GridSearchCV(rf, param_grid_rf, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# Refit with *all* tuned parameters (including n_estimators) on the training
# split only; fitting on the full data would leak the test rows into the
# model and inflate both its test score and its blend weight.
rf = RandomForestRegressor(**best_params, random_state=42)
rf.fit(x_train, y_train)
score_rf = rf.score(x_test, y_test)
y_pred_rf = rf.predict(x_test)

# ---------------------------------------------------------------------------
# Gradient-boosted decision trees
# ---------------------------------------------------------------------------
gbdt = GradientBoostingRegressor()
param_grid_gbdt = {
    'max_depth': [3, 4, 5],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [50, 60, 70, 80, 90],
}
grid_search = GridSearchCV(gbdt, param_grid_gbdt, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

gbdt = GradientBoostingRegressor(**best_params, random_state=42)
gbdt.fit(x_train, y_train)
score_gbdt = gbdt.score(x_test, y_test)
y_pred_gbdt = gbdt.predict(x_test)

# ---------------------------------------------------------------------------
# XGBoost
# ---------------------------------------------------------------------------
XGB = xgb.XGBRegressor()
# XGBoost does not understand scikit-learn's ``max_features`` /
# ``min_samples_leaf``; it ignores them, which turns the grid into repeated
# identical fits. Use the native counterparts instead:
#   * colsample_bynode  - fraction of features sampled per split
#   * min_child_weight  - minimum hessian sum per leaf, i.e. the minimum
#                         sample count under the default squared-error loss
n_features = x_train.shape[1]
colsample_options = sorted({
    float(np.sqrt(n_features) / n_features),   # approximates 'sqrt'
    float(np.log2(n_features) / n_features),   # approximates 'log2'
})
param_grid_XGB = {
    'max_depth': [3, 4, 5],
    'colsample_bynode': colsample_options,
    'min_child_weight': [50, 60, 70, 80, 90, 100],
}
grid_search = GridSearchCV(XGB, param_grid_XGB, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

XGB = xgb.XGBRegressor(**best_params, random_state=42)
XGB.fit(x_train, y_train)
score_XGB = XGB.score(x_test, y_test)
y_pred_XGB = XGB.predict(x_test)

# ---------------------------------------------------------------------------
# Blend the predictions, weighting each model by its share of the total R^2
# ---------------------------------------------------------------------------
# NOTE(review): the weights are only meaningful when every score is
# positive; a negative R^2 would yield a negative weight.
a = score_rf + score_gbdt + score_XGB
subscore_rf = score_rf / a
subscore_gbdt = score_gbdt / a
subscore_XGB = score_XGB / a
y_pred = (
    y_pred_rf * subscore_rf
    + y_pred_gbdt * subscore_gbdt
    + y_pred_XGB * subscore_XGB
)
print(y_pred)

# ---------------------------------------------------------------------------
# Evaluation metrics (each computed once)
# ---------------------------------------------------------------------------
explained_variance = explained_variance_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse_test1 = mse ** 0.5
r2 = r2_score(y_test, y_pred)
print("解释方差分:", explained_variance)
print("平均绝对误差:", mae)
print("均方误差:", mse)
print("均方根误差:", rmse_test1)
print("决定系数:", r2)

# ---------------------------------------------------------------------------
# Line plot: true vs. predicted claim for each test sample
# ---------------------------------------------------------------------------
plt.figure(figsize=(15, 5))
plt.plot(range(len(y_test)), y_test, 'r', label='TRUE')
plt.plot(range(len(y_test)), y_pred, 'b', label='PRED')
plt.legend()
plt.show()

# ---------------------------------------------------------------------------
# Scatter plot: predicted vs. true, with the y = x reference line
# ---------------------------------------------------------------------------
plt.figure(figsize=(5, 5))
plt.scatter(y_test, y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--')
plt.xlabel('TRUE')
plt.ylabel('PRED')
plt.show()