直接给代码:

  1 # -- coding: gbk --
  2 from sklearn.datasets import load_breast_cancer
  3 from sklearn.tree import DecisionTreeClassifier
  4 from sklearn.model_selection import  train_test_split
  5 from sklearn.tree import export_graphviz
  6 import pandas as pd
  7 import graphviz
  8 import mglearn
  9 from sklearn.ensemble import RandomForestClassifier
 10 from sklearn.datasets import make_moons
 11 from sklearn.ensemble import GradientBoostingClassifier
 12 from sklearn.svm import SVC
 13 from pylab import *
 14 def 决策树():
 15     cancer = load_breast_cancer()
 16     X_train, X_test, y_train, y_test = train_test_split(
 17         cancer.data, cancer.target, stratify=cancer.target, random_state=42)
 18     tree = DecisionTreeClassifier(random_state=0)
 19     print(X_train)
 20     print(y_train.shape)
 21     tree.fit(X_train, y_train)
 22     y_predict=tree.predict(X_test)
 23     print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
 24     print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
 25     '''
 26     export_graphviz(tree, out_file="tree.dot", class_names=["malignant", "benign"], feature_names=cancer.feature_names,
 27                     impurity=False, filled=True)
 28 
 29     with open("tree.dot") as f:
 30         dot_graph = f.read()
 31     graphviz.Source(dot_graph)
 32     '''
 33     print("特征的重要:\n{}".format(tree.feature_importances_))
 34 
 35 def 随机森林():
 36     X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
 37     X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=42)
 38     '''五颗随机森林'''
 39     forest = RandomForestClassifier(n_estimators=5, random_state=2)
 40     forest.fit(X_train, y_train)
 41     y_pred=forest.predict(X_test)
 42     print(y_pred)
 43     print(y_test)
 44     print(np.mean(y_test==y_pred ))
 45     fig, axes = plt.subplots(2, 3, figsize=(20, 10))
 46     for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):    ax.set_title("Tree {}".format(i))
 47     mglearn.plots.plot_tree_partition(X_train, y_train, tree, ax=ax)
 48     mglearn.plots.plot_2d_separator(forest, X_train, fill=True, ax=axes[-1, -1], alpha=.4)
 49     axes[-1, -1].set_title("Random Forest")
 50     mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
 51 
 52 def 梯度提升树():
 53     cancer = load_breast_cancer()
 54     X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
 55     #gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
 56     gbrt = GradientBoostingClassifier(random_state=0,  learning_rate=0.01)
 57     gbrt.fit(X_train, y_train)
 58     y_pred=gbrt.predict(X_test)
 59     print(y_pred)
 60     print(np.mean(y_pred==y_test))
 61 
 62 def SVM向量机简易():
 63     X, y = mglearn.tools.make_handcrafted_dataset()
 64     '''
 65     gamma参数是上一节给出的公式中的参数,用于控制高斯核的宽度。它决定了点与点之间“靠近”是指多大的距离。
 66     C参数是正则化参数,与线性模型中用到的类似。它限制每个点的重要性(或者更确切地说,每个点的dual_coef_)。
 67 
 68     '''
 69     svm = SVC(kernel='rbf', C=10, gamma=0.1).fit(X, y)
 70     mglearn.plots.plot_2d_separator(svm, X, eps=.5)
 71     mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
 72     # 画出支持向量
 73     sv = svm.support_vectors_
 74     sv_labels = svm.dual_coef_.ravel() > 0
 75     mglearn.discrete_scatter(sv[:, 0], sv[:, 1], sv_labels, s=15, markeredgewidth=3)
 76     plt.xlabel("Feature 0")
 77     plt.ylabel("Feature 1")
 78     plt.show()
 79 
 80 def 预处理向量机数据():
 81     cancer = load_breast_cancer()
 82     X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
 83     svc = SVC()
 84     svc.fit(X_train, y_train)
 85     y_pred=svc.predict(X_test)
 86     print(np.mean(y_pred==y_test))
 87 
 88     '''预处理——缩放'''
 89     min_on_training = X_train.min(axis=0)
 90     range_on_training = (X_train - min_on_training).max(axis=0)
 91 
 92     X_train_scaled = (X_train - min_on_training) / range_on_training
 93     X_test_scaled = (X_test - min_on_training) / range_on_training
 94     print("Minimum for each feature\n{}".format(X_train_scaled.min(axis=0)))
 95     print("Maximum for each feature\n {}".format(X_train_scaled.max(axis=0)))
 96 
 97     '''变换'''
 98     X_test_scaled = (X_test - min_on_training) / range_on_training
 99     svc = SVC()
100     svc.fit(X_train_scaled, y_train)
101     y_pred=svc.predict(X_test_scaled)
102     print(np.mean(y_pred==y_test))
# Script entry point: run the SVM preprocessing demo when executed directly.
if __name__ == "__main__":
    预处理向量机数据()