#正则化:降低模型的复杂度,避免过拟合。

#加载模块
from sklearn.datasets import load_iris
import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Load the iris dataset and split it into training and test partitions.
data = load_iris()
X, y = data.data, data.target
# 30% of the samples are held out for testing; the fixed random_state keeps
# the partition identical from run to run.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

# Build the estimators. Most hyper-parameters set here are regularisation
# knobs that trade training-set fit against model simplicity.

# k-nearest neighbours: n_neighbors is the number of neighbours that vote.
kneighbor = KNeighborsClassifier(n_neighbors=3)

# Ordinary least-squares linear regression (no regularisation).
lr = LinearRegression()

# Ridge regression (L2 penalty). A larger alpha shrinks the coefficients more
# strongly and yields a simpler model; as alpha approaches 0 the penalty
# vanishes and the fit approaches plain LinearRegression.
ridge = Ridge(alpha=0.01)

# Lasso regression (L1 penalty). A larger alpha drives more coefficients to
# exactly zero (simpler model); a smaller alpha gives a more complex model.
# max_iter caps the number of solver iterations.
lasso = Lasso(alpha=0.01, max_iter=100)

# Logistic regression. C is the inverse of the regularisation strength: a
# larger C means weaker regularisation, a more complex model and a closer fit
# to the training data, at the risk of over-fitting.
log = LogisticRegression(C=1)

# Linear support vector machine. C has the same meaning as above.
# random_state pins the solver's internal random number generator so repeated
# runs give the same model.
linearSVC = LinearSVC(C=10, random_state=0)

# Naive Bayes classifiers (not instantiated here): GaussianNB for continuous
# data, BernoulliNB for binary data, MultinomialNB for count data. The latter
# two are mostly used for text classification.

# Decision tree. random_state makes the choice between equally good candidate
# splits deterministic. max_depth limits the depth of the tree; together with
# max_leaf_nodes and min_samples_leaf it is a pre-pruning control against
# over-fitting.
tree = DecisionTreeClassifier(random_state=0, max_depth=4)

# Random forest. n_estimators is the number of trees; averaging over more
# trees reduces over-fitting. max_features (left at its default) controls how
# random each tree is; smaller values reduce over-fitting.
randomtree = RandomForestClassifier(n_estimators=4, random_state=2)

# Kernel support vector machine. C is the regularisation parameter (larger C
# means a more complex model). gamma is the kernel coefficient: a larger gamma
# makes each training point's influence more local, so the decision boundary
# becomes more complex and varies faster.
svc = SVC(C=1, gamma=0.1)

# Multi-layer perceptron. hidden_layer_sizes=(100,) is ONE hidden layer with
# 100 units (not 100 layers); alpha is the strength of the L2 penalty on the
# weights. random_state fixes weight initialisation and batch shuffling so the
# printed predictions and scores are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(100,), alpha=0.1, random_state=0)

# Train every estimator on the same training partition, in declaration order.
for estimator in (
    kneighbor,
    lr,
    ridge,
    lasso,
    log,
    linearSVC,
    tree,
    randomtree,
    svc,
    mlp,
):
    estimator.fit(train_X, train_y)

# Persist each trained estimator to disk. The file names are relative, so the
# files are written to the current working directory.
for model_path, fitted in (
    ('kneighbor.model', kneighbor),
    ('LR.model', lr),
    ('Ridge.model', ridge),
    ('lasso.model', lasso),
    ('log.model', log),
    ('linearSVC.model', linearSVC),
    ('tree.model', tree),
    ('randomtree.model', randomtree),
    ('svc.model', svc),
    ('mlp.model', mlp),
):
    joblib.dump(value=fitted, filename=model_path)

# Load the persisted estimators back from disk (model0 .. model9 follow the
# order of the file names below).
(
    model0,
    model1,
    model2,
    model3,
    model4,
    model5,
    model6,
    model7,
    model8,
    model9,
) = [
    joblib.load(filename=model_path)
    for model_path in (
        "kneighbor.model",
        "LR.model",
        "Ridge.model",
        "lasso.model",
        "log.model",
        "linearSVC.model",
        "tree.model",
        "randomtree.model",
        "svc.model",
        "mlp.model",
    )
]
# For each reloaded model print two lines: its prediction for the third test
# sample (index 2), then its score on the whole test partition.
# Note: score() is mean accuracy for the classifiers but R^2 for the
# regressors (model1, model2, model3), whose predictions are floats rather
# than class labels.
for restored in (
    model0,
    model1,
    model2,
    model3,
    model4,
    model5,
    model6,
    model7,
    model8,
    model9,
):
    print(restored.predict(test_X)[2])
    print(restored.score(test_X, test_y))
# Re-configure a loaded model's parameters and retrain it.
# The triple-quoted string below is disabled code kept as a bare string
# literal: it is evaluated as an expression and discarded, never executed.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version before re-enabling this snippet.
'''model1.set_params(normalize=True).fit(train_X,train_y)

#新模型做预测
print(model1.predict(test_X))
print(model1.score(test_X,test_y))'''