from sklearn.datasets import fetch_openml
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
def sort_by_target(mnist, train_size=60000):
    """Sort the train and test portions of ``mnist`` by label, in place.

    The first ``train_size`` rows are treated as the training split and the
    remaining rows as the test split. Each split is reordered independently
    so that its labels are ascending; rows sharing a label keep their
    original relative order.

    Args:
        mnist: Object exposing ``data`` and ``target`` attributes that
            support NumPy-style slicing and integer-array indexing.
        train_size: Number of leading rows that form the training split.
            Defaults to 60000.

    Returns:
        None. ``mnist.data`` and ``mnist.target`` are overwritten in place.
    """
    labels = np.asarray(mnist.target)
    # A stable argsort orders by (label, original index), which is the same
    # ordering as sorting (label, index) tuples, and it also copes with an
    # empty split.
    train_order = np.argsort(labels[:train_size], kind="stable")
    test_order = np.argsort(labels[train_size:], kind="stable") + train_size
    # Integer-array indexing yields a copy, so assigning back into the
    # slice being read from is safe.
    mnist.data[:train_size] = mnist.data[train_order]
    mnist.target[:train_size] = mnist.target[train_order]
    mnist.data[train_size:] = mnist.data[test_order]
    mnist.target[train_size:] = mnist.target[test_order]
# Download MNIST from OpenML. as_frame=False asks for NumPy arrays: the
# positional indexing used below (and inside sort_by_target) does not work on
# the pandas objects that recent scikit-learn releases return by default.
mnist = fetch_openml('mnist_784', as_frame=False)
mnist.target = mnist.target.astype(np.int8)  # store the labels as small integers
sort_by_target(mnist)
# print(mnist)
X , y = mnist['data'] , mnist['target']
some_digit = X[36000]  # sample image reused by the predictions further down
# some_digit_img = some_digit.reshape(28,28)
# plt.imshow(some_digit_img,cmap=matplotlib.cm.binary,interpolation='nearest')
# plt.axis('off')
# plt.show()
print(y[36000]) # 5
# The first 60000 rows form the training set, the remainder the test set.
X_train , X_test , y_train , y_test = X[:60000] , X[60000:] , y[:60000] , y[60000:]
# Shuffle the training set, which sort_by_target left ordered by label.
shuffle_index = np.random.permutation(60000)
X_train , y_train = X_train[shuffle_index] , y_train[shuffle_index]
# Boolean targets for the binary "is it a 5?" task.
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
######## SGDClassifier #######
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
# sgd_clf.fit(X_train,y_train_5)
# print(sgd_clf.predict([some_digit]))
from sklearn.model_selection import cross_val_score
# acc = cross_val_score(sgd_clf,X_train,y_train_5,cv=3,scoring='accuracy')
# print(acc)
from sklearn.model_selection import cross_val_predict # returns the predictions made on each cross-validation fold
# y_train_pred = cross_val_predict(sgd_clf,X_train,y_train_5,cv=3)
# Confusion matrix: evaluates classifier performance (rows are actual classes, columns are predicted classes)
from sklearn.metrics import confusion_matrix
# matrix = confusion_matrix(y_train_5,y_train_pred)
# print(matrix)
# Precision, recall, F1 (2 / ((1/precision) + (1/recall)))
from sklearn.metrics import precision_score , recall_score , f1_score
# print(precision_score(y_train_5,y_train_pred))
# print(recall_score(y_train_5,y_train_pred))
# print(f1_score(y_train_5,y_train_pred)) # favours classifiers whose precision and recall are similar
# Precision/recall trade-off (each constrains the other)
# cross_val_predict() obtains the scores of all training instances; precision_recall_curve() computes precision and recall for every possible threshold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve
# Return decision scores instead of predictions
# y_scores = cross_val_predict(sgd_clf,X_train,y_train_5,cv=3,method='decision_function')
# precisions , recalls , thresholds = precision_recall_curve(y_train_5,y_scores)
# Precision and recall versus threshold curves
def plot_precision_recall_vs_threshold(precisions,recalls,thresholds):
    """Draw precision and recall as two curves over the decision threshold.

    The final entry of ``precisions`` and ``recalls`` is dropped before
    plotting, presumably because both carry one more entry than
    ``thresholds`` (confirm against the caller). Draws on the current
    pyplot figure and returns nothing.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_style, label=curve_label)
    plt.xlabel('Threshold')
    plt.legend(loc='upper left')
    plt.ylim([0,1])
# plot_precision_recall_vs_threshold(precisions,recalls,thresholds)
# plt.show()
# Precision versus recall curve
def plot_precision_vc_recall(precisions,recalls):
    """Draw precision (y axis) against recall (x axis) on the unit square.

    Draws on the current pyplot figure and returns nothing.
    """
    label_style = {'fontsize': '16'}
    plt.plot(recalls, precisions, "b--", linewidth=2)
    plt.xlabel('Recall', **label_style)
    plt.ylabel('Precision', **label_style)
    unit_square = [0, 1, 0, 1]
    plt.axis(unit_square)
# plt.figure(figsize=(8,6))
# plot_precision_vc_recall(precisions,recalls)
# plt.savefig('plot_precision_vc_recall')
# plt.show()
# ROC curve (true positive rate (TPR) plotted against false positive rate (FPR))
from sklearn.metrics import roc_curve
# fpr , tpr , thresholds = roc_curve(y_train_5,y_scores) # compute the TPR and FPR at multiple thresholds
def plot_roc_curve(fpr,tpr,label=None):
    """Draw a ROC curve: true positive rate against false positive rate.

    Args:
        fpr: False positive rates, plotted on the x axis.
        tpr: True positive rates, plotted on the y axis.
        label: Optional legend label for the curve.

    Draws on the current pyplot figure and returns nothing.
    """
    plt.plot(fpr,tpr,'b--',linewidth=2,label=label)
    plt.plot([0,1],[0,1],'k--')  # diagonal reference line from (0, 0) to (1, 1)
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    # The y axis needs its own label; a second xlabel call would only
    # overwrite the x-axis label set above.
    plt.ylabel('True Positive Rate')
# plot_roc_curve(fpr,tpr)
# plt.show()
# Area under the ROC curve (AUC)
from sklearn.metrics import roc_auc_score
# print(roc_auc_score(y_train_5,y_scores))
####### RandomForestClassifier ######
# RandomForestClassifier has no decision_function(); predict_proba() returns an array (one row per instance, one column per class) holding the probability that a given instance belongs to a given class
# but plotting the ROC curve needs a single score per instance rather than the per-class probabilities
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf,X_train,y_train_5,cv=3,method='predict_proba')
#
y_scores_forest = y_probas_forest[:,1] # use the positive-class probability as the score
# # Compute the true/false positive rates for every possible threshold
fpr_forest , tpr_forest , thresholds_forest = roc_curve(y_train_5,y_scores_forest)
#
# plt.plot(fpr,tpr,'b:',label='SGD')
# plt.plot(fpr_forest,tpr_forest,label='Random Forest')
# plt.legend(loc='lower right')  # 'bottom right' is not a valid matplotlib legend location
# plt.show()
# print(roc_auc_score(y_train_5,y_scores_forest))
######## Multiclass classifiers (OvA: one-versus-all, OvO: one-versus-one) #########
# OvA: train one binary classifier per class
# OvO: train one binary classifier per pair of classes (more classifiers), but each one only needs part of the training set
# SVM classifiers scale poorly with the size of the training set, so OvO suits them
# Digits 0-9: 10 classifiers
sgd_clf = sgd_clf.fit(X_train,y_train) # multiclass
print(sgd_clf.predict([some_digit]))
some_digit_scores = sgd_clf.decision_function([some_digit]) # returns 10 scores, one per class
print(some_digit_scores)
print(np.argmax(some_digit_scores))
print(sgd_clf.classes_)
# Force the OvO or OvA strategy
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
# ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42)) # SGD classifier wrapped in OvO
# ovo_clf = ovo_clf.fit(X_train,y_train)
# print(ovo_clf.predict([some_digit]))
# print(len(ovo_clf.estimators_))
#
# multi_forest_clf_score = cross_val_score(sgd_clf,X_train,y_train,cv=3,scoring='accuracy')
# print(multi_forest_clf_score)
# Standardise the pixel features before cross-validating the SGD classifier.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# cross_val_score(sgd_clf,X_train_scaled,y_train,cv=3,scoring='accuracy')
y_train_pred = cross_val_predict(sgd_clf,X_train_scaled,y_train,cv=3)
conf_mx = confusion_matrix(y_train,y_train_pred)
# print(conf_mx)
plt.matshow(conf_mx,cmap=plt.cm.gray)
plt.show()
# Divide each row by its total so the cells become per-class rates rather than raw counts.
row_sums = conf_mx.sum(axis=1,keepdims=True)
norm_conf_mx = conf_mx / row_sums
# np.fill_diagonal modifies the array in place and returns None, so its
# result must not be assigned back. Zeroing the diagonal leaves only the
# misclassifications visible in the plot.
np.fill_diagonal(norm_conf_mx,0)
plt.matshow(norm_conf_mx,cmap=plt.cm.gray)
plt.show()
# Random forests handle multiple classes directly
# forest_clf = forest_clf.fit(X_train,y_train)
# print(forest_clf.predict([some_digit]))
# print(forest_clf.predict_proba([some_digit])) # per-class probabilities for each instance
######## Multilabel classification ########
from sklearn.neighbors import KNeighborsClassifier
# Two boolean labels per digit: "is large (>= 7)" and "is odd".
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilable = np.c_[y_train_large,y_train_odd]
knn_clf = KNeighborsClassifier()
knn_clf = knn_clf.fit(X_train,y_multilable)
print(knn_clf.predict([some_digit]))
# Cross-validate against the multilabel targets the classifier is trained on;
# using y_train here would evaluate a 10-class digit classifier instead.
y_train_knn_pred = cross_val_predict(knn_clf,X_train,y_multilable,cv=3)
# average='macro' treats every label equally; to weight each label by its support, pass average='weighted'
print(f1_score(y_multilable,y_train_knn_pred,average='macro'))