import numpy as np
np.random.seed(42)

from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.inspection import plot_partial_dependence
import pydotplus
from pydotplus import graph_from_dot_data
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
#import graphviz

# Load the insurance data set.
# Raw string: "\p" and "\i" are not valid escape sequences, so the plain
# literal only worked by accident and warns on recent Python versions.
data = pd.read_csv(r"G:\py数据库\insurance_data.csv")

# One-hot encode the categorical "region" column. It is selected by name
# (the same name that is dropped below) rather than by position.
ohe = OneHotEncoder()
data_transform = ohe.fit_transform(data[["region"]]).toarray()

# OneHotEncoder emits its output columns in the order of `categories_`
# (sorted category values), so the labels are taken from the encoder itself.
# A hand-written list in any other order would silently mislabel the columns.
region_columns = [str(category) for category in ohe.categories_[0]]
data_transform4 = pd.DataFrame(data_transform,
                               columns=region_columns,
                               index=data.index)

# Replace the raw "region" column with its one-hot columns.
data2 = data.drop(["region"], axis=1)
data3 = pd.concat((data2, data_transform4), axis=1)

# Final column layout: base features, region indicators, then the target.
base_features = ["age", "gender", "bmi", "bloodpressure", "diabetic",
                 "children", "smoker"]
feature_columns = base_features + region_columns
order = feature_columns + ["claim"]

data4 = data3[order]

# Feature matrix / target vector and a 70/30 train/test split.
# No random_state is passed, so the split draws from the global NumPy RNG.
x = data4[feature_columns]
y = data4["claim"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)


# Random forest: hyper-parameters are tuned with a 10-fold cross-validated
# grid search that only ever sees the training split.
rf = RandomForestRegressor()

# Candidate hyper-parameters.
param_grid_rf = {'max_depth': [4, 5, 6],
                 'max_features': ['sqrt', 'log2'],
                 'min_samples_leaf': [50, 60],
                 'n_estimators': [300]}

grid_search = GridSearchCV(rf, param_grid_rf, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# Rebuild the model with *all* tuned parameters (including n_estimators) and
# a fixed seed, then fit on the training split only. Fitting on the full
# data set would leak the test rows into the model and inflate score_rf,
# which is later used as an ensemble weight.
rf = RandomForestRegressor(**best_params, random_state=42)
rf.fit(x_train, y_train)
score_rf = rf.score(x_test, y_test)  # R^2 on the held-out test split
y_pred_rf = rf.predict(x_test)


# Gradient-boosted trees: base estimator handed to the grid search.
gbdt = GradientBoostingRegressor()

# Candidate hyper-parameters.
param_grid_gbdt = dict(
    max_depth=[3, 4, 5],
    max_features=['sqrt', 'log2'],
    min_samples_leaf=[50, 60, 70, 80, 90],
)

# 10-fold cross-validated grid search on the training split.
grid_search = GridSearchCV(estimator=gbdt, param_grid=param_grid_gbdt, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# Refit a fresh, seeded model with the winning max_depth / max_features /
# min_samples_leaf on the training split and evaluate it on the test split.
gbdt = GradientBoostingRegressor(random_state=42, **best_params)
gbdt.fit(x_train, y_train)
score_gbdt = gbdt.score(x_test, y_test)  # R^2 on the held-out test split
y_pred_gbdt = gbdt.predict(x_test)


# XGBoost: hyper-parameters are tuned with a 10-fold cross-validated grid
# search on the training split.
XGB = xgb.XGBRegressor()

# XGBoost does not know scikit-learn's `max_features` / `min_samples_leaf`;
# passing them is accepted but they are ignored, so a grid over them just
# refits the same model repeatedly. The native counterparts are used instead:
#   * colsample_bynode - fraction of columns sampled at each split; the
#     'sqrt' / 'log2' feature counts are converted to (roughly) the same
#     fraction and de-duplicated.
#   * min_child_weight - minimum hessian sum in a child, which for
#     squared-error regression is the number of samples in that child.
n_features = x_train.shape[1]
colsample_options = sorted({
    max(1, int(np.sqrt(n_features))) / n_features,
    max(1, int(np.log2(n_features))) / n_features,
})
param_grid_XGB = {'max_depth': [3, 4, 5],
                  'colsample_bynode': colsample_options,
                  'min_child_weight': [50, 60, 70, 80, 90, 100]}

grid_search = GridSearchCV(XGB, param_grid_XGB, cv=10)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# Refit a fresh, seeded model with the winning parameters on the training
# split and evaluate it on the test split.
XGB = xgb.XGBRegressor(**best_params, random_state=42)
XGB.fit(x_train, y_train)
score_XGB = XGB.score(x_test, y_test)  # R^2 on the held-out test split
y_pred_XGB = XGB.predict(x_test)


# Blend the three models, weighting each prediction by its test-set R^2.
# R^2 can be negative for a model worse than predicting the mean; such a
# score is clipped to 0 so it cannot produce a negative weight (or push the
# others above 1). If no model has a positive score, fall back to equal
# weights instead of dividing by zero.
model_scores = np.clip(
    np.array([score_rf, score_gbdt, score_XGB], dtype=float), 0.0, None)
a = model_scores.sum()
if a > 0:
    model_weights = model_scores / a
else:
    model_weights = np.full(model_scores.shape, 1.0 / model_scores.size)
subscore_rf, subscore_gbdt, subscore_XGB = model_weights

y_pred = (y_pred_rf * subscore_rf
          + y_pred_gbdt * subscore_gbdt
          + y_pred_XGB * subscore_XGB)

print(y_pred)

# Evaluation metrics for the blended prediction on the held-out test split.
# Each metric is computed exactly once and reused for printing; RMSE is
# derived from the already-computed MSE.
evs_test = explained_variance_score(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
rmse_test1 = mse_test ** 0.5
r2_test = r2_score(y_test, y_pred)

print("解释方差分:", evs_test)    # explained variance score
print("平均绝对误差:", mae_test)  # mean absolute error
print("均方误差:", mse_test)      # mean squared error
print("均方根误差:", rmse_test1)  # root mean squared error
print("决定系数:", r2_test)       # coefficient of determination (R^2)

# Line chart: true (red) vs. predicted (blue) values, plotted against the
# position of each sample in the test split.
sample_positions = range(len(y_test))
fig_lines, ax_lines = plt.subplots(figsize=(15, 5))
ax_lines.plot(sample_positions, y_test, 'r', label='TRUE')
ax_lines.plot(sample_positions, y_pred, 'b', label='PRED')
ax_lines.legend()
plt.show()

# Scatter chart: predicted vs. true values, with a dashed y = x reference
# line spanning the range of the true values.
fig_scatter, ax_scatter = plt.subplots(figsize=(5, 5))
ax_scatter.scatter(y_test, y_pred)
true_min, true_max = y_test.min(), y_test.max()
ax_scatter.plot([true_min, true_max], [true_min, true_max], 'k--')
ax_scatter.set_xlabel('TRUE')
ax_scatter.set_ylabel('PRED')
plt.show()