先运行 main.py 进行文本序列化,再运行 train.py 进行模型训练。

 

dataset.py

from torch.utils.data import DataLoader,Dataset
import torch
import os
from utils import tokenlize
import config



class ImdbDataset(Dataset):
    """Map-style dataset over the review text files of the aclImdb corpus.

    Every ``.txt`` file found under ``<root>/<train|test>/pos`` and
    ``<root>/<train|test>/neg`` is one sample. Indexing returns
    ``(tokens, label)`` where ``tokens`` is the output of ``tokenlize`` and
    ``label`` is 0 or 1.

    :param train: use the ``train`` split when true, otherwise ``test``.
    :param data_root: directory that contains the ``train``/``test`` folders;
        defaults to ``DEFAULT_DATA_ROOT`` when ``None``.
    """

    # Default corpus location used when no ``data_root`` is supplied.
    DEFAULT_DATA_ROOT = r"H:\073-nlp自然语言处理-v5.bt38[周大伟]\073-nlp自然语言处理-v5.bt38[周大伟]\第四天\代码\data\aclImdb_v1\aclImdb"

    def __init__(self, train=True, data_root=None):
        super(ImdbDataset, self).__init__()
        root = self.DEFAULT_DATA_ROOT if data_root is None else data_root
        # os.path.join keeps the path portable instead of hard-coding "\\".
        split_dir = os.path.join(root, "train" if train else "test")
        self.total_path = []
        for sentiment in ("pos", "neg"):
            cur_path = os.path.join(split_dir, sentiment)
            self.total_path += [
                os.path.join(cur_path, name)
                for name in os.listdir(cur_path)
                if name.endswith(".txt")
            ]

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for the file at position ``idx``."""
        file = self.total_path[idx]
        # Context manager so the file handle is closed for every sample.
        with open(file, encoding="utf-8") as handle:
            review = handle.read()
        review = tokenlize(review)
        # The integer between the last "_" and the next "." in the path is
        # read as the score; scores below 5 become label 0, the rest label 1.
        score = int(file.split("_")[-1].split(".")[0])
        label = 0 if score < 5 else 1
        return review, label

    def __len__(self):
        """Return the number of review files collected at construction."""
        return len(self.total_path)

def collate_fn(batch):
    """Collate a list of ``(review, label)`` samples into two LongTensors.

    Each review is converted with ``config.ws.transform`` using
    ``config.max_len``; the labels are stacked as they are.

    :param batch: iterable of ``(review, label)`` pairs.
    :return: ``(reviews, labels)``, both ``torch.LongTensor``.
    """
    reviews, labels = zip(*batch)
    encoded = []
    for tokens in reviews:
        encoded.append(config.ws.transform(tokens, max_len=config.max_len))
    return torch.LongTensor(encoded), torch.LongTensor(labels)


def get_dataloader(train):
    """Build a shuffled DataLoader over the requested split.

    :param train: true for the training split, false for the test split.
        The flag selects both the dataset split and the batch size
        (``config.train_batch_size`` or ``config.test_batch_size``).
    :return: ``DataLoader`` that batches samples with ``collate_fn``.
    """
    # Forward the flag: a hard-coded train=True would make the "test" loader
    # silently serve training data.
    imdbdataset = ImdbDataset(train=train)
    batch_size = config.train_batch_size if train else config.test_batch_size
    return DataLoader(imdbdataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


if __name__ == '__main__':
    # Smoke test: print the first training batch (if any) and stop.
    loader = get_dataloader(train=True)
    for review, label in loader:
        print(review)
        print(label)
        break

  utils.py

"""
实现额外的方法
"""
import re

# Characters that are replaced by a space before splitting. The apostrophe is
# not listed, so contractions such as "it's" stay a single token.
_FILTER_CHARS = ['!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                 '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n', '\x97', '\x96', '”', '“']

# re.escape makes every character match literally. Joining hand-escaped
# strings left "$" and "^" acting as anchors and let the lone backslash
# swallow the following "|", so "\" and "]" were never filtered.
_FILTER_PATTERN = re.compile("|".join(re.escape(ch) for ch in _FILTER_CHARS))


def tokenlize(sentence):
    """
    Split a piece of text into lower-cased word tokens.

    The text is lower-cased, ``<br />`` tags and every character in
    ``_FILTER_CHARS`` are replaced by a space, and the result is split on
    spaces with empty pieces dropped.

    :param sentence: str
    :return: [str,str,str]
    """
    sentence = sentence.lower()
    # Must run before the filter, which would otherwise break the tag apart
    # and leave "br" behind as a token.
    sentence = sentence.replace("<br />", " ")
    sentence = _FILTER_PATTERN.sub(" ", sentence)
    return [token for token in sentence.split(" ") if token]

word_sequence.py

'''
文本序列化
'''

class WordSequence():
    """Vocabulary that converts token lists to integer id lists and back.

    Usage: call ``fit`` once per tokenized sentence to accumulate word
    counts, then ``build_vocab`` to assign ids, then ``transform`` /
    ``inverse_transform`` to convert. Ids 0 and 1 are reserved for the
    padding and unknown-word tags.
    """
    UNK_TAG = "<UNK>"
    PAD_TAG = "<PAD>"
    UNK = 1
    PAD = 0

    def __init__(self):
        # token -> id
        self.dict = {
            self.UNK_TAG:self.UNK,
            self.PAD_TAG:self.PAD
        }
        # token -> number of occurrences seen by fit()
        self.count = {}
        # id -> token; initialised here so inverse_transform also works
        # before build_vocab has been called.
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def fit(self,sentence):
        '''
        Add the words of one sentence to the frequency counts.

        :param sentence: iterable of hashable tokens
        :return: None
        '''
        for word in sentence:
            self.count[word] = self.count.get(word,0)+1

    def build_vocab(self,min_count=0,max_count = None,max_features = None):
        """
        Build the token/id mappings from the accumulated counts.

        :param min_count: keep only words whose count is strictly greater
            than this value; ``None`` disables the filter
        :param max_count: keep only words whose count is strictly less than
            this value; ``None`` disables the filter
        :param max_features: keep at most this many words, most frequent
            first; ``None`` disables the limit
        :return: None
        """
        if min_count is not None:
            self.count = {word:count for word,count in self.count.items() if count > min_count}
        if max_count is not None:
            self.count = {word:count for word,count in self.count.items() if count < max_count}
        if max_features is not None:
            # ``key`` is keyword-only in Python 3; passing the lambda
            # positionally raises TypeError.
            ranked = sorted(self.count.items(), key=lambda item: item[-1], reverse=True)
            self.count = dict(ranked[:max_features])

        # Start again from the reserved tags so that calling build_vocab
        # more than once cannot hand out duplicate ids.
        self.dict = {
            self.UNK_TAG:self.UNK,
            self.PAD_TAG:self.PAD
        }
        for word in self.count:
            if word not in self.dict:  # never overwrite a reserved tag
                self.dict[word] = len(self.dict)

        # Invert once, after the loop, and also when no word was kept.
        self.inverse_dict = dict(zip(self.dict.values(),self.dict.keys()))

    def transform(self,sentence,max_len =None):
        '''
        Convert a token sequence into a list of ids.

        :param sentence: sequence of tokens
        :param max_len: when given, the result is truncated or padded with
            the PAD id to exactly this length; ``None`` keeps the length
        :return: list of int ids; unknown tokens map to ``UNK``
        '''
        tokens = list(sentence)
        if max_len is not None:
            if len(tokens) > max_len:
                tokens = tokens[:max_len]
            else:
                tokens = tokens + [self.PAD_TAG]*(max_len-len(tokens))
        return [self.dict.get(token,self.UNK) for token in tokens]

    def inverse_transform(self,incides):
        """
        Convert a sequence of ids back into tokens.

        :param incides: iterable of int ids
        :return: list of tokens; ids not in the vocabulary map to ``UNK_TAG``
        """
        return [self.inverse_dict.get(i,self.UNK_TAG) for i in incides]

    def __len__(self):
        """Return the vocabulary size, including the two reserved tags."""
        return len(self.dict)

if __name__ == '__main__':
    # Demo: fit two short sentences, build the vocabulary, then encode and
    # decode a sentence that contains out-of-vocabulary words.
    ws = WordSequence()
    ws.fit(["今天","天气","很","好"])
    ws.fit(["今天","去","吃","什么"])
    ws.build_vocab(min_count=0)
    print(ws.dict)

    encoded = ws.transform(["好","热"] + ["呀"]*7, max_len=5)
    print(encoded)
    decoded = ws.inverse_transform(encoded)
    print(decoded)

  main.py

from word_sequence import WordSequence
from dataset import get_dataloader
import pickle
from tqdm import tqdm

if __name__ == '__main__':
    import os

    ws = WordSequence()
    # Fit on the raw tokenized samples of each underlying dataset. The
    # loaders batch through collate_fn, which already converts reviews to
    # id tensors with config.ws, so iterating the loaders would count tensor
    # elements instead of words.
    for loader in (get_dataloader(True), get_dataloader(False)):
        dataset = loader.dataset
        for idx in tqdm(range(len(dataset)), total=len(dataset)):
            review, _label = dataset[idx]
            ws.fit(review)
    print("正在建立...")
    ws.build_vocab()
    print(len(ws))
    # Make sure the target folder exists and close the file after dumping.
    os.makedirs("./models", exist_ok=True)
    with open("./models/ws.pkl", "wb") as handle:
        pickle.dump(ws, handle)

  model.py

"""
构建模型
"""
import torch.nn as nn
import config
import torch.nn.functional as F

class ImdbModel(nn.Module):
    """Embedding layer followed by one linear layer with two outputs."""

    def __init__(self):
        super().__init__()
        embedding_dim = 300
        vocab_size = len(config.ws)
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=config.ws.PAD,
        )
        # The linear layer consumes all embeddings of a sequence flattened
        # into one vector, hence max_len * embedding_dim input features.
        self.fc = nn.Linear(config.max_len * embedding_dim, 2)

    def forward(self, input):
        '''
        Compute log-probabilities over the two classes.

        :param input: tensor of token ids; the sequence length is expected
            to equal config.max_len so the flattened width matches ``fc``
        :return: tensor of log-softmax scores with two columns
        '''
        embedded = self.embedding(input)
        batch_size = embedded.size(0)
        flattened = embedded.view(batch_size, -1)
        logits = self.fc(flattened)
        return F.log_softmax(logits, dim=-1)

  LSTMmodel.py

"""
构建模型
"""
import torch.nn as nn
import torch
import config
import torch.nn.functional as F

class ImdbModel(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> two linear layers."""

    def __init__(self):
        super(ImdbModel,self).__init__()
        # One shared value so the embedding output width always matches the
        # LSTM input width (they disagreed before: 300 vs 200).
        embedding_dim = 200
        hidden_size = 64
        self.embedding = nn.Embedding(num_embeddings=len(config.ws),embedding_dim=embedding_dim,padding_idx=config.ws.PAD)
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_size,num_layers=2,batch_first=True,bidirectional=True,dropout=0.5)
        # Forward and backward final states are concatenated: 2 * hidden_size.
        self.fc1 = nn.Linear(hidden_size*2,64)
        self.fc2 = nn.Linear(64,2)

    def forward(self,input):
        '''
        Compute log-probabilities over the two classes.

        :param input: tensor of token ids, [batch_size, seq_len]
        :return: tensor of log-softmax scores, [batch_size, 2]
        '''
        input_embeded = self.embedding(input)    # [batch_size, seq_len, embedding_dim]

        _output,(h_n,_c_n) = self.lstm(input_embeded)
        # h_n is [num_layers * 2, batch_size, hidden_size]; the last two
        # entries are the top layer's forward and backward final states.
        # torch.cat needs a sequence of tensors as its first argument.
        out = torch.cat([h_n[-2,:,:],h_n[-1,:,:]],dim=-1)

        # fully connected -> ReLU -> fully connected
        out_fc1 = self.fc1(out)
        out_fc1_relu = F.relu(out_fc1)
        out = self.fc2(out_fc1_relu)
        return  F.log_softmax(out,dim=-1)

  train.py

'''
进行模型的训练
'''
import torch

import config
from model import ImdbModel
from dataset import get_dataloader
from torch.optim import Adam
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from eval import eval

# Module-level training state used by train() and the __main__ script below.
model = ImdbModel()
model = model.to(config.device)
optimizer = Adam(params=model.parameters(), lr=1e-3)
# One entry per optimisation step, accumulated across all epochs.
loss_list = list()

def _save_checkpoint():
    """Write the current model and optimizer state to ./models."""
    torch.save(model.state_dict(),"./models/model.pkl")
    torch.save(optimizer.state_dict(),"./models/optimizer.pkl")


def train(epoch):
    """Run one pass over the training data, updating the global ``model``.

    Each step's loss is appended to the global ``loss_list``; the progress
    bar shows the mean of every loss recorded so far. Checkpoints are
    written every 10 batches and once more after the last batch.

    :param epoch: epoch number, used only in the progress-bar text.
    """
    train_dataloader = get_dataloader(train=True)
    bar = tqdm(train_dataloader,total=len(train_dataloader))

    for idx,(inputs,target) in enumerate(bar):
        optimizer.zero_grad()
        inputs = inputs.to(config.device)
        target = target.to(config.device)
        output = model(inputs)
        loss = F.nll_loss(output,target)
        loss.backward()
        loss_list.append(loss.item())
        optimizer.step()
        bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch,idx,np.mean(loss_list)))

        if idx%10 == 0:
            _save_checkpoint()

    # Save once more so the file on disk holds the weights from the final
    # batches too; code that reloads the checkpoint would otherwise see a
    # model that is up to 9 steps old.
    _save_checkpoint()

if __name__ == '__main__':
    # Train for five epochs, evaluating the saved checkpoint after each one.
    for i in range(5):
        train(i)
        eval()
    # Plot the per-step training loss collected over all epochs.
    plt.figure(figsize=(20,8))
    plt.plot(range(len(loss_list)),loss_list)
    # Without show() the figure is built but never displayed when the file
    # is run as a script.
    plt.show()

  eval.py

'''
Evaluate the trained model on the test data.
'''
import torch

import config
from model import ImdbModel
from dataset import get_dataloader
from torch.optim import Adam
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt



def eval():
    """Load the saved checkpoint and print loss and accuracy on test data.

    A fresh ``ImdbModel`` is created, the weights are read from
    ``./models/model.pkl``, and every batch of
    ``get_dataloader(train=False)`` is scored without gradients. Both
    metrics are averaged per sample, so a smaller final batch does not skew
    the result. Prints ``nan`` for both values if the loader is empty.
    """
    model = ImdbModel().to(config.device)
    # map_location lets a checkpoint saved on one device load on another
    # (e.g. trained on GPU, evaluated on CPU).
    state_dict = torch.load("./models/model.pkl", map_location=config.device)
    model.load_state_dict(state_dict)
    model.eval()

    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    test_dataloader = get_dataloader(train=False)
    with torch.no_grad():
        for inputs,target in test_dataloader:
            inputs = inputs.to(config.device)
            target = target.to(config.device)
            output = model(inputs)
            # Sum, not mean, so the final average is weighted per sample.
            total_loss += F.nll_loss(output,target,reduction="sum").item()
            pred = output.argmax(dim=-1)
            total_correct += pred.eq(target).sum().item()
            total_samples += target.size(0)

    if total_samples:
        mean_loss = total_loss / total_samples
        accuracy = total_correct / total_samples
    else:
        mean_loss = accuracy = float("nan")
    print("loss:{:.6f},acc:{}".format(mean_loss,accuracy))


# Running this file directly performs a single evaluation pass.
if __name__ == '__main__':
    eval()