1. Classification decision tree model
from sklearn.tree import DecisionTreeClassifier
x = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]  # toy feature matrix
y = [1, 0, 0, 1, 1]  # class labels
model = DecisionTreeClassifier(random_state=0)
model.fit(x, y)
print(model.predict([[5, 5], [6, 9]]))
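To inspect the splits the toy classifier learned, sklearn's export_text helper prints the fitted tree as indented text. A minimal sketch (the names 'f0' and 'f1' are illustrative placeholders for the two feature columns):
from sklearn.tree import export_text
print(export_text(model, feature_names=['f0', 'f1']))  # one line per split and leaf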
2. Regression decision tree model
from sklearn.tree import DecisionTreeRegressor
x = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]  # toy feature matrix
y = [1, 2, 3, 4, 7]  # continuous targets
model = DecisionTreeRegressor(max_depth=2, random_state=0)
model.fit(x, y)
print(model.predict([[6, 7], [1, 3]]))
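A regression tree predicts the mean target value of the training samples that fall into the same leaf. A minimal sketch using the tree's standard apply method makes this visible:
import numpy as np
leaf_ids = model.apply(x)  # index of the leaf each training sample lands in
for leaf in np.unique(leaf_ids):
    # each leaf predicts the mean target of its training samples
    print(leaf, np.array(y)[leaf_ids == leaf].mean())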
import pandas as pd
# Load the employee-attrition dataset from a local Excel file
df = pd.read_excel('C:\\Users\\gaijinchao\\Desktop\\数据分析\\源代码汇总-2020-12-16\\第5章 决策树模型\\源代码汇总_Jupyter Notebook格式(推荐)\\员工离职预测模型.xlsx')
df.head()
df = df.replace({'工资': {'低': 0, '中': 1, '高': 2}})  # encode 工资 (salary): 低/中/高 (low/medium/high) -> 0/1/2
df.head()
x = df.drop(columns='离职')  # features: every column except the attrition label
y = df['离职']  # target: 离职 (attrition), 1 = left, 0 = stayed
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=5, random_state=111)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(y_pred[:100])
a = pd.DataFrame()
a['预测值'] = list(y_pred)  # predicted values
a['实际值'] = list(y_test)  # actual values
a.head(10)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)  # test-set accuracy (y_true first, by convention)
print(score)
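Equivalently, the classifier's built-in score method reports the same test-set accuracy:
print(model.score(x_test, y_test))  # mean accuracy, same value as accuracy_score above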
y_pred_proba = model.predict_proba(x_test)
b = pd.DataFrame(y_pred_proba, columns=['不离职概率', '离职概率'])  # P(stay), P(leave)
b.head(10)
from sklearn.metrics import roc_curve
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:,1])
a = pd.DataFrame()
a['阈值'] = list(thres)  # thresholds
a['命中率'] = list(tpr)  # hit rate (TPR)
a['假警报率'] = list(fpr)  # false alarm rate (FPR)
a.head(10)
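The table shows how the hit rate and false alarm rate trade off as the threshold moves. The default predict above corresponds to a 0.5 cutoff on the positive-class probability; a small check using only the arrays computed above:
manual_pred = (y_pred_proba[:, 1] > 0.5).astype(int)  # predict 离职 when P(离职) > 0.5
print((manual_pred == y_pred).all())  # expected: True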
Plot the ROC curve
import matplotlib.pyplot as plt
plt.plot(fpr, tpr)
plt.xlabel('FPR')  # false alarm rate
plt.ylabel('TPR')  # hit rate
plt.show()
Compute the AUC value
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, y_pred_proba[:,1])
print(score)
Taking the CART decision tree as an example: once the model has split all the way down to its leaf nodes, the total Gini impurity reduction of the whole tree is a; if the sum of the Gini impurity reductions produced at the nodes that split on a given feature is b, then that feature's importance is (b/a) * 100%.
features = x.columns  # feature names
importances = model.feature_importances_  # feature importances of the fitted model
importances_df = pd.DataFrame()
importances_df['feature'] = features
importances_df['importance'] = importances
importances_df.sort_values(by=['importance'], ascending=False, inplace=True)
display(importances_df)
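Since each importance is that feature's share b of the total impurity reduction a, the importances are normalized and sum to 1. A one-line check:
print(importances.sum())  # ≈ 1.0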
from sklearn.tree import export_graphviz
import graphviz
import os
os.environ['PATH'] += os.pathsep + r'C:\Program Files (x86)\Graphviz2.38\bin'  # append the Graphviz bin directory to PATH
dot_data = export_graphviz(model, out_file=None, class_names=['0', '1'])  # class '0' = stay, '1' = leave
graph = graphviz.Source(dot_data)
graph.render("result")
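graph.render("result") writes the DOT source to result and the rendered tree to result.pdf (PDF is graphviz's default output format) in the current working directory.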
from sklearn.model_selection import cross_val_score
acc = cross_val_score(model, x, y, cv=5)  # cv=5: 5-fold cross-validation
print(acc)
print(acc.mean())
The scoring parameter sets the evaluation metric; it defaults to accuracy, but can also be set to compute the AUC value
auc = cross_val_score(model, x, y, scoring='roc_auc', cv=5)
print(auc)
print(auc.mean())
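For classifiers, scoring='roc_auc' is computed from the predicted probability of the positive class, so it is consistent with the roc_auc_score call above.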
import numpy as np
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth': np.arange(1, 10, 1)}  # candidate depths 1 through 9
model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, parameters, scoring='roc_auc', cv=5)
grid_search.fit(x_train, y_train)
display(grid_search.best_params_)
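GridSearchCV also records the best cross-validated score; printing it shows the mean AUC achieved by the chosen max_depth:
print(grid_search.best_score_)  # mean cross-validated AUC of the best parameter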
model = DecisionTreeClassifier(max_depth=7)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
score1 = accuracy_score(y_test, y_pred)
y_pred_proba = model.predict_proba(x_test)
score2 = roc_auc_score(y_test, y_pred_proba[:,1])
The new model's AUC is 0.985, an improvement over the original 0.976
print(score1) # 准确度
print(score2) # AUC值
Note that as the model's depth increases, the feature importances change as well
print(model.feature_importances_)
Note: if a parameter's optimal value lands on the boundary of its search range, enlarge the range and tune again
parameters = {'max_depth': np.arange(5, 14, 2),
              'criterion': ['gini', 'entropy'],
              'min_samples_split': np.arange(13, 24, 2)}
model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, parameters, scoring='roc_auc', cv=5)
grid_search.fit(x_train, y_train)
display(grid_search.best_params_)
model = DecisionTreeClassifier(max_depth=11, criterion='entropy', min_samples_split=23)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
score1 = accuracy_score(y_test, y_pred)
y_pred_proba = model.predict_proba(x_test)
score2 = roc_auc_score(y_test, y_pred_proba[:,1])
print(score1) # 准确度
print(score2) # AUC值
print(model.feature_importances_)