1. Prepare the training samples
The Fudan University text classification sample dataset is used (the expected directory layout is sketched right after this list).
2. Train the model
3. Prepare the test data
4. Classify
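The scripts below assume the Fudan corpus is unpacked so that each category is a subdirectory of the training folder and every file inside it is one raw GBK-encoded document. A minimal sketch to confirm that layout before running anything else (the path is the same illustrative one used in the training script, so adjust it to your machine) could be:

import os

corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/文本集/train/"  # illustrative path, adjust as needed

# Print every category directory and how many documents it contains
for category in sorted(os.listdir(corpus_path)):
    if category.startswith("."):
        continue
    docs = os.listdir(os.path.join(corpus_path, category))
    print(category, ":", len(docs), "documents")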
Training the model
import os
import pickle

import jieba
from sklearn.utils import Bunch  # on older scikit-learn: from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vector space model


# Save content to a file
def savefile(savepath, content):
    with open(savepath, "w", encoding="GBK") as fp:
        fp.write(content)


# Read a file, returning None if it cannot be decoded
def readfile(path, encode):
    try:
        with open(path, "r", encoding=encode) as fp:
            return fp.read()
    except UnicodeDecodeError:
        print("Error: failed to read " + path)
        return None


# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


# Write a Bunch object to disk
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


# Segment the whole corpus: tokenize every training document and store the result
def segment(corpus_path, seg_path):
    # Every subdirectory of corpus_path is one category
    cateList = os.listdir(corpus_path)
    for myDir in cateList:
        if not myDir.startswith("."):
            # Directory holding this category's raw documents
            class_path = corpus_path + myDir + "/"
            # Directory where the segmented documents of this category are written
            seg_dir = seg_path + myDir + "/"
            if not os.path.exists(seg_dir):
                os.makedirs(seg_dir)
            # Process every file in the category directory
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print("path:" + fullname)
                content = readfile(fullname, "GBK")
                if content is not None:
                    # Remove line breaks and redundant whitespace
                    content = content.replace("\r\n", "").strip()
                    # Tokenize the document with jieba
                    content_seg = jieba.cut(content)
                    # Join the tokens with spaces so TfidfVectorizer can split them later
                    savefile(seg_dir + file_path, " ".join(content_seg))
    print("Chinese corpus segmentation finished!")


# Convert the segmented text files into a Bunch object and persist it
def bunchObj(wordbag_path, seg_path):
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = os.listdir(seg_path)
    # Record the category names
    bunch.target_name.extend(catelist)
    for myDir in catelist:
        if not myDir.startswith("."):
            class_path = seg_path + myDir + "/"
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print(fullname)
                # Category label of the current file
                bunch.label.append(myDir)
                # Full path of the current file
                bunch.filename.append(fullname)
                # Segmented text of the current file
                bunch.contents.append(readfile(fullname, "GBK").strip())
    # Persist the Bunch object
    writebunchobj(wordbag_path, bunch)
    print("Building the text Bunch object finished!")


# Train: build the TF-IDF vector space from the training set
def startTrain(stopword_path, wordbag_path, space_path):
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # Load the Bunch object holding the segmented training documents
    bunch = readbunchobj(wordbag_path)
    # Build the TF-IDF vector space model
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filename=bunch.filename, tdm=[], vocabulary={})
    # TfidfVectorizer computes the TF-IDF weight of every term
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
    # Convert the documents into a term-document TF-IDF matrix and keep the vocabulary
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # Persist the word bag (the TF-IDF space)
    writebunchobj(space_path, tfidfspace)
    print("Training of the text classification model finished")


# Raw (unsegmented) training corpus
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/文本集/train/"
# Segmented training corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_segment/"
# Persisted Bunch object of the segmented corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/train_set.dat"
# Stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Persisted TF-IDF space (word bag)
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Step 1: segment the training corpus (run once, then comment out)
# segment(corpus_path, segment_path)
# Step 2: convert the segmented files into a persisted Bunch object (run once, then comment out)
# bunchObj(wordbag_path, segment_path)
# Step 3: train
startTrain(stop_words_path, wordbag_path, space_path)
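Before moving on to the test data, it can be worth loading tfidfspace.dat back and checking its dimensions. This is a small verification sketch, not part of the original scripts; it only assumes the space_path used above.

import pickle

space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

with open(space_path, "rb") as f:
    tfidfspace = pickle.load(f)

print("documents x terms:", tfidfspace.tdm.shape)      # sparse TF-IDF matrix
print("vocabulary size:", len(tfidfspace.vocabulary))  # number of distinct terms
print("categories:", tfidfspace.target_name)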
Preparing the test data
import os
import pickle

import jieba
from sklearn.utils import Bunch  # on older scikit-learn: from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vector space model


# Save content to a file
def savefile(savepath, content):
    with open(savepath, "w", encoding="GBK") as fp:
        fp.write(content)


# Read a file, returning None if it cannot be decoded
def readfile(path, encode):
    try:
        with open(path, "r", encoding=encode) as fp:
            return fp.read()
    except UnicodeDecodeError:
        print("Error: failed to read " + path)
        return None


# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


# Write a Bunch object to disk
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


# Segment the whole test corpus and store the tokenized documents
def segment(corpus_path, seg_path):
    # Every subdirectory of corpus_path is one category
    cateList = os.listdir(corpus_path)
    for myDir in cateList:
        if not myDir.startswith("."):
            # Directory holding this category's raw documents
            class_path = corpus_path + myDir + "/"
            # Directory where the segmented documents of this category are written
            seg_dir = seg_path + myDir + "/"
            if not os.path.exists(seg_dir):
                os.makedirs(seg_dir)
            # Process every file in the category directory
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print("path:" + fullname)
                content = readfile(fullname, "GBK")
                if content is not None:
                    # Remove line breaks and redundant whitespace
                    content = content.replace("\r\n", "").strip()
                    # Tokenize the document with jieba
                    content_seg = jieba.cut(content)
                    # Join the tokens with spaces so TfidfVectorizer can split them later
                    savefile(seg_dir + file_path, " ".join(content_seg))
    print("Chinese corpus segmentation finished!")


# Convert the segmented text files into a Bunch object and persist it
def bunchObj(wordbag_path, seg_path):
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = os.listdir(seg_path)
    # Record the category names
    bunch.target_name.extend(catelist)
    for myDir in catelist:
        if not myDir.startswith("."):
            class_path = seg_path + myDir + "/"
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print(fullname)
                # Category label of the current file
                bunch.label.append(myDir)
                # Full path of the current file
                bunch.filename.append(fullname)
                # Segmented text of the current file
                bunch.contents.append(readfile(fullname, "GBK").strip())
    # Persist the Bunch object
    writebunchobj(wordbag_path, bunch)
    print("Building the text Bunch object finished!")


# Build the test-set TF-IDF space, reusing the vocabulary of the training set
def startTrain(stopword_path, wordbag_path, space_path, train_space_path):
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # Load the Bunch object holding the segmented test documents
    bunch = readbunchobj(wordbag_path)
    # Build the test-set TF-IDF vector space
    testspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                      filenames=bunch.filename, tdm=[], vocabulary={})
    # Load the training word bag to reuse its vocabulary
    trainbunch = readbunchobj(train_space_path)
    # Fixing vocabulary to the training vocabulary keeps the feature columns of the
    # test matrix aligned with those of the training matrix
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    testspace.tdm = vectorizer.fit_transform(bunch.contents)
    testspace.vocabulary = trainbunch.vocabulary
    writebunchobj(space_path, testspace)
    print("Test-set TF-IDF space built")


# Raw (unsegmented) test corpus
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/测试文本集/"
# Segmented test corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_segment/"
# Persisted Bunch object of the segmented test corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/test_set.dat"
# Stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Persisted test TF-IDF space
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
# Persisted training TF-IDF space (provides the vocabulary)
train_space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Step 1: segment the test corpus (run once, then comment out)
# segment(corpus_path, segment_path)
# Step 2: convert the segmented files into a persisted Bunch object (run once, then comment out)
# bunchObj(wordbag_path, segment_path)
# Step 3: build the test TF-IDF space
startTrain(stop_words_path, wordbag_path, space_path, train_space_path)
Testing
import pickle

from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier


# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


# Load the training TF-IDF space
trainpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"
train_set = readbunchobj(trainpath)

# Load the test TF-IDF space
testpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
test_set = readbunchobj(testpath)

# Train a multinomial naive Bayes classifier.
# alpha is the additive (Laplace/Lidstone) smoothing parameter; a small value such as
# 0.001 applies very little smoothing, which often works well with TF-IDF features
# but can make the model more sensitive to rare terms.
clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)

# Predict the category of every test document
predicted = clf.predict(test_set.tdm)
total = len(predicted)
error_count = 0
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    print(file_name, "actual category:", flabel, "--> predicted category:", expct_cate)
    if flabel != expct_cate:
        error_count += 1

# Error rate over the whole test set
print("error rate:", float(error_count) * 100 / float(total), "%")
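As an optional extension not in the original post, scikit-learn's metrics module reports the same error rate plus per-category precision and recall. The sketch below assumes clf and test_set from the script above are still in scope.

from sklearn import metrics

# Predict again and summarize the results per category
predicted = clf.predict(test_set.tdm)
print("accuracy:", metrics.accuracy_score(test_set.label, predicted))
print(metrics.classification_report(test_set.label, predicted))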