1. Prepare the training samples
The Fudan University text classification sample dataset is used (the expected directory layout is sketched right after this list).
2. Train the model
3. Prepare the test data
4. Classify
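The scripts below assume the Fudan corpus is unpacked so that each category is a subdirectory of the training folder and every file inside it is one raw GBK-encoded document. A minimal sketch to confirm that layout before running anything else (the path is the same illustrative one used in the training script, so adjust it to your machine) could be:

import os

corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/文本集/train/"  # illustrative path, adjust as needed

# Print every category directory and how many documents it contains
for category in sorted(os.listdir(corpus_path)):
    if category.startswith("."):
        continue
    docs = os.listdir(os.path.join(corpus_path, category))
    print(category, ":", len(docs), "documents")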
Training the model
import os
import pickle

import jieba
from sklearn.utils import Bunch  # on older scikit-learn: from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vector space model


# Save content to a file
def savefile(savepath, content):
    with open(savepath, "w", encoding="GBK") as fp:
        fp.write(content)


# Read a file, returning None if it cannot be decoded
def readfile(path, encode):
    try:
        with open(path, "r", encoding=encode) as fp:
            return fp.read()
    except UnicodeDecodeError:
        print("Error: failed to read " + path)
        return None


# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


# Write a Bunch object to disk
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


# Segment the whole corpus: tokenize every training document and store the result
def segment(corpus_path, seg_path):
    # Every subdirectory of corpus_path is one category
    cateList = os.listdir(corpus_path)
    for myDir in cateList:
        if not myDir.startswith("."):
            # Directory holding this category's raw documents
            class_path = corpus_path + myDir + "/"
            # Directory where the segmented documents of this category are written
            seg_dir = seg_path + myDir + "/"
            if not os.path.exists(seg_dir):
                os.makedirs(seg_dir)
            # Process every file in the category directory
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print("path:" + fullname)
                content = readfile(fullname, "GBK")
                if content is not None:
                    # Remove line breaks and redundant whitespace
                    content = content.replace("\r\n", "").strip()
                    # Tokenize the document with jieba
                    content_seg = jieba.cut(content)
                    # Join the tokens with spaces so TfidfVectorizer can split them later
                    savefile(seg_dir + file_path, " ".join(content_seg))
    print("Chinese corpus segmentation finished!")


# Convert the segmented text files into a Bunch object and persist it
def bunchObj(wordbag_path, seg_path):
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = os.listdir(seg_path)
    # Record the category names
    bunch.target_name.extend(catelist)
    for myDir in catelist:
        if not myDir.startswith("."):
            class_path = seg_path + myDir + "/"
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print(fullname)
                # Category label of the current file
                bunch.label.append(myDir)
                # Full path of the current file
                bunch.filename.append(fullname)
                # Segmented text of the current file
                bunch.contents.append(readfile(fullname, "GBK").strip())
    # Persist the Bunch object
    writebunchobj(wordbag_path, bunch)
    print("Building the text Bunch object finished!")


# Train: build the TF-IDF vector space from the training set
def startTrain(stopword_path, wordbag_path, space_path):
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # Load the Bunch object holding the segmented training documents
    bunch = readbunchobj(wordbag_path)
    # Build the TF-IDF vector space model
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filename=bunch.filename, tdm=[], vocabulary={})
    # TfidfVectorizer computes the TF-IDF weight of every term
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
    # Convert the documents into a term-document TF-IDF matrix and keep the vocabulary
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # Persist the word bag (the TF-IDF space)
    writebunchobj(space_path, tfidfspace)
    print("Training of the text classification model finished")


# Raw (unsegmented) training corpus
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/文本集/train/"
# Segmented training corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_segment/"
# Persisted Bunch object of the segmented corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/train_set.dat"
# Stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Persisted TF-IDF space (word bag)
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Step 1: segment the training corpus (run once, then comment out)
# segment(corpus_path, segment_path)
# Step 2: convert the segmented files into a persisted Bunch object (run once, then comment out)
# bunchObj(wordbag_path, segment_path)
# Step 3: train
startTrain(stop_words_path, wordbag_path, space_path)
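Before moving on to the test data, it can be worth loading tfidfspace.dat back and checking its dimensions. This is a small verification sketch, not part of the original scripts; it only assumes the space_path used above.

import pickle

space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

with open(space_path, "rb") as f:
    tfidfspace = pickle.load(f)

print("documents x terms:", tfidfspace.tdm.shape)      # sparse TF-IDF matrix
print("vocabulary size:", len(tfidfspace.vocabulary))  # number of distinct terms
print("categories:", tfidfspace.target_name)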
Preparing the test data
import os
import pickle

import jieba
from sklearn.utils import Bunch  # on older scikit-learn: from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vector space model


# Save content to a file
def savefile(savepath, content):
    with open(savepath, "w", encoding="GBK") as fp:
        fp.write(content)


# Read a file, returning None if it cannot be decoded
def readfile(path, encode):
    try:
        with open(path, "r", encoding=encode) as fp:
            return fp.read()
    except UnicodeDecodeError:
        print("Error: failed to read " + path)
        return None


# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


# Write a Bunch object to disk
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


# Segment the whole test corpus and store the tokenized documents
def segment(corpus_path, seg_path):
    # Every subdirectory of corpus_path is one category
    cateList = os.listdir(corpus_path)
    for myDir in cateList:
        if not myDir.startswith("."):
            # Directory holding this category's raw documents
            class_path = corpus_path + myDir + "/"
            # Directory where the segmented documents of this category are written
            seg_dir = seg_path + myDir + "/"
            if not os.path.exists(seg_dir):
                os.makedirs(seg_dir)
            # Process every file in the category directory
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print("path:" + fullname)
                content = readfile(fullname, "GBK")
                if content is not None:
                    # Remove line breaks and redundant whitespace
                    content = content.replace("\r\n", "").strip()
                    # Tokenize the document with jieba
                    content_seg = jieba.cut(content)
                    # Join the tokens with spaces so TfidfVectorizer can split them later
                    savefile(seg_dir + file_path, " ".join(content_seg))
    print("Chinese corpus segmentation finished!")


# Convert the segmented text files into a Bunch object and persist it
def bunchObj(wordbag_path, seg_path):
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    catelist = os.listdir(seg_path)
    # Record the category names
    bunch.target_name.extend(catelist)
    for myDir in catelist:
        if not myDir.startswith("."):
            class_path = seg_path + myDir + "/"
            file_list = os.listdir(class_path)
            for file_path in file_list:
                fullname = class_path + file_path
                print(fullname)
                # Category label of the current file
                bunch.label.append(myDir)
                # Full path of the current file
                bunch.filename.append(fullname)
                # Segmented text of the current file
                bunch.contents.append(readfile(fullname, "GBK").strip())
    # Persist the Bunch object
    writebunchobj(wordbag_path, bunch)
    print("Building the text Bunch object finished!")


# Build the test-set TF-IDF space, reusing the vocabulary of the training set
def startTrain(stopword_path, wordbag_path, space_path, train_space_path):
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # Load the Bunch object holding the segmented test documents
    bunch = readbunchobj(wordbag_path)
    # Build the test-set TF-IDF vector space
    testspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                      filenames=bunch.filename, tdm=[], vocabulary={})
    # Load the training word bag to reuse its vocabulary
    trainbunch = readbunchobj(train_space_path)
    # Fixing vocabulary to the training vocabulary keeps the feature columns of the
    # test matrix aligned with those of the training matrix
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    testspace.tdm = vectorizer.fit_transform(bunch.contents)
    testspace.vocabulary = trainbunch.vocabulary
    writebunchobj(space_path, testspace)
    print("Test-set TF-IDF space built")


# Raw (unsegmented) test corpus
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/测试文本集/"
# Segmented test corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_segment/"
# Persisted Bunch object of the segmented test corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/test_set.dat"
# Stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Persisted test TF-IDF space
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
# Persisted training TF-IDF space (provides the vocabulary)
train_space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Step 1: segment the test corpus (run once, then comment out)
# segment(corpus_path, segment_path)
# Step 2: convert the segmented files into a persisted Bunch object (run once, then comment out)
# bunchObj(wordbag_path, segment_path)
# Step 3: build the test TF-IDF space
startTrain(stop_words_path, wordbag_path, space_path, train_space_path)
Testing
import pickle

from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes classifier


# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


# Load the training TF-IDF space
trainpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"
train_set = readbunchobj(trainpath)

# Load the test TF-IDF space
testpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
test_set = readbunchobj(testpath)

# Train a multinomial naive Bayes classifier.
# alpha is the additive (Laplace/Lidstone) smoothing parameter; a small value such as
# 0.001 applies very little smoothing, which often works well with TF-IDF features
# but can make the model more sensitive to rare terms.
clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)

# Predict the category of every test document
predicted = clf.predict(test_set.tdm)
total = len(predicted)
error_count = 0
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    print(file_name, "actual category:", flabel, "--> predicted category:", expct_cate)
    if flabel != expct_cate:
        error_count += 1

# Error rate over the whole test set
print("error rate:", float(error_count) * 100 / float(total), "%")
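As an optional extension not in the original post, scikit-learn's metrics module reports the same error rate plus per-category precision and recall. The sketch below assumes clf and test_set from the script above are still in scope.

from sklearn import metrics

# Predict again and summarize the results per category
predicted = clf.predict(test_set.tdm)
print("accuracy:", metrics.accuracy_score(test_set.label, predicted))
print(metrics.classification_report(test_set.label, predicted))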