在PyTorch中使用CNN和LSTM对文本进行分类的完整攻略如下,包括两个示例说明。
1. 示例1:使用CNN和LSTM对IMDB电影评论进行分类
在这个示例中,我们将使用CNN和LSTM对IMDB电影评论进行分类。以下是使用CNN和LSTM对文本进行分类的步骤:
- 准备数据集
首先需要准备IMDB电影评论数据集,并将其转换为PyTorch所支持的格式。可以使用torchtext
库来加载和处理数据集。
```python
import torch
import torchtext
from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator

# TEXT handles the review bodies: lower-cased, whitespace-tokenised
# token sequences, returned with the batch dimension first.
TEXT = Field(sequential=True, lower=True, batch_first=True)
# LABEL handles the sentiment targets (pos/neg) as a categorical field.
LABEL = LabelField()
# Download/load the IMDB reviews and split into train/test examples.
train_data, test_data = IMDB.splits(TEXT, LABEL)
# Keep only the 25k most frequent tokens; everything else maps to <unk>.
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)
# NOTE(review): this is the legacy torchtext.data API (Field, etc.),
# removed in torchtext >= 0.12 -- pin an older torchtext to run as-is.
```
- 定义模型
可以使用CNN和LSTM结合的方式来对文本进行分类。以下是定义模型的示例代码:
```python
import torch.nn as nn
class TextCNNLSTM(nn.Module):
    """CNN + LSTM text classifier.

    A 2-D convolution extracts trigram features from the embedded token
    sequence; a bidirectional LSTM then models the feature sequence, and
    the concatenation of the final forward/backward hidden states is
    projected to class logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        # BUGFIX: the original defined `init` instead of `__init__`, so the
        # constructor was never invoked and no layers were ever created.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 100 filters, each spanning 3 tokens and the full embedding width.
        self.conv = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(3, embedding_dim))
        self.lstm = nn.LSTM(input_size=100, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        # *2 because the LSTM is bidirectional.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: (batch, seq_len) token indices (TEXT uses batch_first=True).
        embedded = self.embedding(text)   # (batch, seq, emb)
        embedded = embedded.unsqueeze(1)  # (batch, 1, seq, emb) for Conv2d
        conv_output = nn.functional.relu(self.conv(embedded)).squeeze(3)  # (batch, 100, seq-2)
        # BUGFIX: nn.LSTM without batch_first expects (seq, batch, features);
        # the conv output is (batch, channels, seq'), so rearrange the axes.
        conv_output = conv_output.permute(2, 0, 1)  # (seq', batch, 100)
        lstm_output, (hidden, cell) = self.lstm(conv_output)
        # Final forward (hidden[-2]) and backward (hidden[-1]) states.
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden)
```
- 训练模型
定义好模型后,可以使用PyTorch的torch.utils.data.DataLoader
和torch.optim
来训练模型。以下是训练模型的示例代码:
```python
import torch.optim as optim
from torch.utils.data import DataLoader

# Mini-batch size shared by the train and test iterators.
BATCH_SIZE = 64
# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# BucketIterator groups examples of similar length to minimise padding.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)
# Embedding dim 100, LSTM hidden dim 256, dropout 0.5; output size is the
# number of label classes built from the training data.
model = TextCNNLSTM(len(TEXT.vocab), 100, 256, len(LABEL.vocab), 0.5)
optimizer = optim.Adam(model.parameters())
# Cross-entropy over the class logits produced by the final linear layer.
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)
# NOTE(review): DataLoader is imported but never used here --
# BucketIterator does the batching.
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return (mean loss, mean accuracy)."""
    total_loss = 0.0
    total_acc = 0.0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        # Accumulate per-batch metrics; averaged over batches below.
        total_loss += loss.item()
        total_acc += acc.item()
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches
def evaluate(model, iterator, criterion):
    """Score the model over `iterator` without updating any weights."""
    total_loss, total_acc = 0.0, 0.0
    model.eval()
    # no_grad: inference only -- skip autograd bookkeeping.
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            total_loss += loss.item()
            total_acc += acc.item()
    return total_loss / len(iterator), total_acc / len(iterator)
def binary_accuracy(preds, y):
    """Return the fraction of correct predictions in a batch as a tensor.

    `preds` are raw class logits of shape (batch, n_classes) straight from
    the model; `y` are integer class labels of shape (batch,).

    BUGFIX: the original rounded `sigmoid(preds)` and compared the
    (batch, n_classes) result against (batch,) labels, which is wrong for
    the CrossEntropyLoss / LabelField setup used in this tutorial. With
    class logits, accuracy comes from the argmax over the class axis.
    The function name is kept for compatibility with train()/evaluate().
    """
    predicted_classes = preds.argmax(dim=1)
    correct = (predicted_classes == y).float()
    return correct.mean()
# Train for a fixed number of epochs, reporting metrics on the training
# set and on the held-out test set after every epoch.
# NOTE(review): the test set doubles as the validation set here; a real
# project should carve out a separate validation split.
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
```
- 测试模型
训练好模型后,可以使用测试集来测试模型的性能。以下是测试模型的示例代码:
```python
def predict_sentiment(model, sentence):
    """Classify a raw sentence; return the probability of class index 1.

    BUGFIX: the original called `nlp.tokenizer(...)`, but `nlp` (a spaCy
    pipeline) is never created anywhere in this tutorial, so the function
    raised NameError. The TEXT field above was built with the default
    whitespace tokenizer and lower=True, so preprocessing must match:
    lower-case, then split on whitespace.
    BUGFIX: TEXT uses batch_first=True, so the model expects (batch, seq);
    the original `unsqueeze(1)` produced (seq, 1). Use `unsqueeze(0)`.
    BUGFIX: the model emits len(LABEL.vocab) logits, so `.item()` on the
    raw output raised; softmax + index selects one probability instead.
    """
    model.eval()
    tokenized = sentence.lower().split()
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)  # add the batch dimension: (1, seq_len)
    with torch.no_grad():
        probs = torch.softmax(model(tensor), dim=1)
    return probs[0, 1].item()


print(predict_sentiment(model, "This film is terrible"))
print(predict_sentiment(model, "This film is great"))
```
2. 示例2:使用CNN和LSTM对新闻文本(AG_NEWS)进行分类
同样的方法也适用于其它文本分类任务(对中文文本只需把分词器换成jieba等中文分词工具)。下面以torchtext自带的AG_NEWS英文新闻数据集为例,演示相同的流程:
- 准备数据集
首先需要准备中文文本数据集,并将其转换为PyTorch所支持的格式。可以使用torchtext
库来加载和处理数据集。
```python
import torchtext
from torchtext.datasets import text_classification
from torchtext.data.utils import get_tokenizer
from collections import Counter

# Simple word-level tokenizer shipped with torchtext.
tokenizer = get_tokenizer('basic_english')
# BUGFIX: the original line read `text_classification.DATASETS'AG_NEWS'`,
# which is a syntax error. DATASETS is a dict mapping dataset names to
# loader functions, so it must be subscripted and then called.
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS']()
```
- 定义模型
可以使用CNN和LSTM结合的方式来对文本进行分类。以下是定义模型的示例代码:
```python
import torch.nn as nn
class TextCNNLSTM(nn.Module):
    """CNN + LSTM text classifier.

    A 2-D convolution extracts trigram features from the embedded token
    sequence; a bidirectional LSTM then models the feature sequence, and
    the concatenation of the final forward/backward hidden states is
    projected to class logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout):
        # BUGFIX: the original defined `init` instead of `__init__`, so the
        # constructor was never invoked and no layers were ever created.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 100 filters, each spanning 3 tokens and the full embedding width.
        self.conv = nn.Conv2d(in_channels=1, out_channels=100, kernel_size=(3, embedding_dim))
        self.lstm = nn.LSTM(input_size=100, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        # *2 because the LSTM is bidirectional.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text: (batch, seq_len) integer token ids.
        embedded = self.embedding(text)   # (batch, seq, emb)
        embedded = embedded.unsqueeze(1)  # (batch, 1, seq, emb) for Conv2d
        conv_output = nn.functional.relu(self.conv(embedded)).squeeze(3)  # (batch, 100, seq-2)
        # BUGFIX: nn.LSTM without batch_first expects (seq, batch, features);
        # the conv output is (batch, channels, seq'), so rearrange the axes.
        conv_output = conv_output.permute(2, 0, 1)  # (seq', batch, 100)
        lstm_output, (hidden, cell) = self.lstm(conv_output)
        # Final forward (hidden[-2]) and backward (hidden[-1]) states.
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden)
```
- 训练模型
定义好模型后,可以使用PyTorch的torch.utils.data.DataLoader
和torch.optim
来训练模型。以下是训练模型的示例代码:
```python
import torch.optim as optim
from torch.utils.data import DataLoader

# Mini-batch size shared by both loaders.
BATCH_SIZE = 64
# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)
# NOTE(review): AG_NEWS items are (label, token_id_tensor) tuples of
# varying length; without a custom collate_fn the default batching will
# fail, and train()/evaluate() read batch.text / batch.label, which plain
# tuples do not provide -- confirm and add a collate_fn before running.
model = TextCNNLSTM(len(train_dataset.get_vocab()), 100, 256, len(train_dataset.get_labels()), 0.5)
optimizer = optim.Adam(model.parameters())
# Cross-entropy over the 4 AG_NEWS class logits.
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return (mean loss, mean accuracy)."""
    running_loss = 0.0
    running_acc = 0.0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        acc = binary_accuracy(logits, batch.label)
        loss.backward()
        optimizer.step()
        # Accumulate per-batch metrics; averaged over batches below.
        running_loss += loss.item()
        running_acc += acc.item()
    batch_count = len(iterator)
    return running_loss / batch_count, running_acc / batch_count
def evaluate(model, iterator, criterion):
    """Score the model over `iterator` without updating any weights."""
    running_loss, running_acc = 0.0, 0.0
    model.eval()
    # no_grad: inference only -- skip autograd bookkeeping.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            loss = criterion(logits, batch.label)
            acc = binary_accuracy(logits, batch.label)
            running_loss += loss.item()
            running_acc += acc.item()
    return running_loss / len(iterator), running_acc / len(iterator)
def binary_accuracy(preds, y):
    """Return the fraction of correct predictions in a batch as a tensor.

    `preds` are raw class logits of shape (batch, n_classes); `y` are
    integer class labels of shape (batch,).

    BUGFIX: the original rounded `sigmoid(preds)` and compared the
    (batch, n_classes) result against (batch,) labels -- meaningless for
    AG_NEWS, which has four classes and is trained with CrossEntropyLoss.
    With class logits, accuracy comes from the argmax over the class axis.
    The function name is kept for compatibility with train()/evaluate().
    """
    predicted_classes = preds.argmax(dim=1)
    correct = (predicted_classes == y).float()
    return correct.mean()
# Train for a fixed number of epochs, reporting metrics on the training
# set and on the held-out test set after every epoch.
# NOTE(review): the test set doubles as the validation set here; a real
# project should carve out a separate validation split.
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_loader, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
```
- 测试模型
训练好模型后,可以使用测试集来测试模型的性能。以下是测试模型的示例代码:
```python
def predict_sentiment(model, sentence):
    """Classify a raw sentence and return the predicted class index.

    BUGFIX: AG_NEWS has four classes, so `model(tensor)` has shape (1, 4)
    and the original `prediction.item()` raised (item() requires exactly
    one element). The predicted label is the argmax over the class logits.
    BUGFIX: the original `unsqueeze(1)` produced a (seq, 1) tensor; the
    TextCNNLSTM stack treats dim 0 as the batch, so the batch dimension
    must be prepended with `unsqueeze(0)` -> (1, seq).
    """
    model.eval()
    tokenized = tokenizer(sentence)
    indexed = [train_dataset.get_vocab()[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)  # add the batch dimension: (1, seq_len)
    with torch.no_grad():
        logits = model(tensor)
    return logits.argmax(dim=1).item()


print(predict_sentiment(model, "This is a good news"))
print(predict_sentiment(model, "This is a bad news"))
```
以上就是使用CNN和LSTM对文本进行分类的完整攻略,包括两个示例说明。
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:pytorch实现用CNN和LSTM对文本进行分类方式 - Python技术站