【小白學習PyTorch教程】十、基于大型電影評論數據集訓練第一個LSTM模型
「@Author:Runsen」
本博客對原始IMDB數據集進行預處理,建立一個簡單的深層神經網絡模型,對給定數據進行情感分析。
數據集下載 here. 原始數據集,沒有進行處理 here.
import numpy as np
# Read the raw reviews and their sentiment labels as whole-file strings;
# later steps split both on newlines.
with open('reviews.txt', 'r') as f:
    reviews = f.read()
with open('labels.txt', 'r') as f:
    labels = f.read()
編碼
在將數據輸入深度學習模型之前,應該將其轉換為數值,文本轉換被稱為「編碼」,這涉及到將每個單詞轉換成一個整數。在進行編碼之前,需要清理數據。 有以下幾個預處理步驟:
刪除標點符號。 使用\n作為分隔符拆分文本。 把所有的評論重新組合成一個大串。
from string import punctuation
# Lower-case everything, then drop every ASCII punctuation character in a
# single pass over the text.
reviews = reviews.lower()
text = reviews.translate(str.maketrans('', '', punctuation))
print(punctuation) # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# One review per line; the per-review strings are kept for later encoding.
reviews_split = text.split('\n')
# Glue all reviews into one corpus string and tokenise it on whitespace.
text = ' '.join(reviews_split)
words = text.split()
建立字典并對評論進行編碼
創建一個「字典」,將詞匯表中的單詞映射為整數。然后通過這個字典,評論可以轉換成整數,然后再傳送到模型網絡。
from collections import Counter
# Rank the vocabulary from the most to the least frequent word.
word_counts = Counter(words)
vocab = [word for word, _ in word_counts.most_common()]
# Indices start at 1, so index 0 is never assigned to a word.
vocab2idx = {word: idx for idx, word in enumerate(vocab, start=1)}
print("Size of Vocabulary: ", len(vocab))
Size of Vocabulary: 74072
# Encode every review as the list of vocabulary indices of its words.
# A review with no words becomes an empty list.
encoded_reviews = [
    [vocab2idx[word] for word in review.split()]
    for review in reviews_split
]
print("The number of reviews: ", len(encoded_reviews))
The number of reviews: 25001
對標簽進行編碼
Negative 和 Positive 應分別標記為0和1(整數)
# One label per line: "positive" becomes 1, anything else becomes 0.
splitted_labels = labels.split("\n")
encoded_labels = np.array(
    [int(label == "positive") for label in splitted_labels]
)
刪除異常值
應刪除長度為0的評論,然后,將對剩余的數據進行填充,保證所有數據具有相同的長度。
# Histogram mapping review length (in tokens) to the number of such reviews.
length_reviews = Counter(map(len, encoded_reviews))
print("Zero-length reviews: ", length_reviews[0])
# max() over a Counter iterates its keys, i.e. the observed lengths.
print("Maximum review length: ", max(length_reviews))
Zero-length reviews: 1 Maximum review length: 2514
# Positions of the reviews that contain at least one token.
non_zero_idx = [idx for idx, tokens in enumerate(encoded_reviews) if tokens]
# Keep reviews and labels aligned by filtering both with the same positions.
encoded_reviews = [encoded_reviews[idx] for idx in non_zero_idx]
encoded_labels = np.array([encoded_labels[idx] for idx in non_zero_idx])
填充序列
下面要處理很長和很短的評論,需要使用0填充短評論,使其適合特定的長度,
并將長評論剪切為seq_length個單詞。這里設置seq_length=200
def text_padding(encoded_reviews, seq_length):
    """Left-pad or truncate every review to exactly ``seq_length`` tokens.

    Args:
        encoded_reviews: Sequence of reviews, each a list of integer token ids.
        seq_length: Number of tokens in every output row.

    Returns:
        A 2-D integer array with one row per review and ``seq_length``
        columns. Reviews shorter than ``seq_length`` are padded with 0 on
        the left; longer ones keep only their first ``seq_length`` tokens.
    """
    rows = []
    for review in encoded_reviews:
        tokens = list(review[:seq_length])
        rows.append([0] * (seq_length - len(tokens)) + tokens)
    # The explicit reshape keeps the result 2-D even when there are no reviews.
    return np.array(rows, dtype=int).reshape(len(rows), seq_length)
# Every review is padded or truncated to this many tokens.
seq_length = 200
padded_reviews = text_padding(encoded_reviews, seq_length=seq_length)
# Peek at the first 12 tokens of the first 12 reviews.
print(padded_reviews[:12, :12])

數據加載器
將數據按8:1:1的比例拆分為訓練集、驗證集和測試集,然后使用“TensorDataset”和“DataLoader”函數來處理評論和標簽數據。
# The first 80% of the data trains the model; the remainder is cut in half
# into a validation part and a test part.
ratio = 0.8
train_length = int(len(padded_reviews) * ratio)
X_train, remaining_x = padded_reviews[:train_length], padded_reviews[train_length:]
y_train, remaining_y = encoded_labels[:train_length], encoded_labels[train_length:]
test_length = int(len(remaining_x) * 0.5)
X_val, X_test = remaining_x[:test_length], remaining_x[test_length:]
y_val, y_test = remaining_y[:test_length], remaining_y[test_length:]
print("Feature shape of train review set: ", X_train.shape)
print("Feature shape of val review set: ", X_val.shape)
print("Feature shape of test review set: ", X_test.shape)

import torch
from torch.utils.data import TensorDataset, DataLoader
batch_size = 50
device = "cuda" if torch.cuda.is_available() else "cpu"
# Each dataset pairs the padded reviews with their labels; the tensors are
# moved to the target device once, before batching.
train_dataset = TensorDataset(torch.from_numpy(X_train).to(device), torch.from_numpy(y_train).to(device))
valid_dataset = TensorDataset(torch.from_numpy(X_val).to(device), torch.from_numpy(y_val).to(device))
test_dataset = TensorDataset(torch.from_numpy(X_test).to(device), torch.from_numpy(y_test).to(device))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
# Pull one batch to inspect. The next() builtin is the Python 3 iterator
# protocol; a .next() method is not part of it.
data_iter = iter(train_loader)
X_sample, y_sample = next(data_iter)
RNN模型的實現

到目前為止,包括標記化在內的預處理已經完成。現在建立一個神經網絡模型來預測評論的情緒。
首先,「嵌入層」將單詞標記轉換為特定大小的向量。
第二,一個LSTM層,由hidden_size和num_layers定義。第三,通過完全連接的層從LSTM層的輸出映射期望的輸出大小。
最后,sigmoid激活層以概率0到1的形式返回輸出。
import torch.nn as nn
from torch.autograd import Variable
class Model(nn.Module):
    """Embedding -> stacked LSTM -> dropout/linear/sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Number of features in the LSTM hidden state.
        output_dim: Number of output units of the final linear layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # embedding and LSTM
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            # inter-layer dropout only exists between stacked
                            # layers; passing it with one layer just warns
                            dropout=0.5 if num_layers > 1 else 0.0,
                            bidirectional=False)
        # fully connected head
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid()
        )

    def forward(self, token, hidden=None):
        """Return one sigmoid score per sequence in the batch.

        Args:
            token: Integer token ids, batch-first (the LSTM is built with
                ``batch_first=True``).
            hidden: Initial ``(h_0, c_0)`` pair as produced by
                ``init_hidden``; ``None`` lets the LSTM start from zeros.

        Returns:
            A 1-D tensor with one value per batch element: the last output
            unit of the head applied to the final time step.
        """
        embedded = self.embedding(token.long())
        lstm_out, _ = self.lstm(embedded, hidden)
        # Only the final time step feeds the prediction, so the head is
        # applied to that step alone instead of to every step.
        out = self.fc(lstm_out[:, -1, :])
        return out[:, -1]

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h_0, c_0)`` for ``batch_size`` sequences.

        The tensors are created with the dtype and device of the model's
        own parameters, so no module-level ``device`` is needed.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (reference.new_zeros(shape), reference.new_zeros(shape))
vocab_size : 詞匯量 embedding_dim : 嵌入查找表中的列數 hidden_dim : LSTM單元隱藏層中的單元數 output_dim : 期望輸出的大小
# Hyper-parameters of the network.
vocab_size = len(vocab) + 1  # +1 for the 0 padding + our word tokens
embedding_dim = 400
hidden_dim = 256
output_dim = 1
num_layers = 2
model = Model(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    num_layers,
).to(device)
# Bare expression: echoes the model summary when run in a notebook cell.
model

訓練
對于損失函數,BCELoss被用于「二分類交叉熵損失」,通過給出介于0和1之間的概率進行分類。使用Adam優化器,學習率為0.001。
另外,torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5) 用于防止RNN中的梯度爆炸問題,max_norm 是梯度範數被裁剪到的最大值。
# Loss function and Optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): neither name was defined anywhere in the original snippet.
# The epoch count is a placeholder value; tune as needed.
num_epochs = 4
train_losses = []
for epoch in range(num_epochs):
    model.train()
    for i, (review, label) in enumerate(train_loader):
        review, label = review.to(device), label.to(device)
        # The model does not return its hidden state, so each batch starts
        # from zeros; sizing by the actual batch keeps a short final batch
        # from failing.
        hidden = model.init_hidden(review.size(0))
        # Initialize Optimizer
        optimizer.zero_grad()
        # Feed Forward
        output = model(review, hidden)
        # Calculate the Loss. reshape(-1) keeps a batch of one 1-D, where
        # squeeze() would collapse it to a scalar and mismatch the label.
        loss = criterion(output.reshape(-1), label.float())
        # Back Propagation
        loss.backward()
        # Prevent Exploding Gradient Problem
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        # Update
        optimizer.step()
        train_losses.append(loss.item())
        # Print Statistics
        if (i + 1) % 100 == 0:
            ### Evaluation ###
            val_losses = []
            model.eval()
            # No gradients are needed while measuring validation loss.
            with torch.no_grad():
                for val_review, val_label in valid_loader:
                    val_review, val_label = val_review.to(device), val_label.to(device)
                    val_h = model.init_hidden(val_review.size(0))
                    val_output = model(val_review, val_h)
                    val_loss = criterion(val_output.reshape(-1), val_label.float())
                    val_losses.append(val_loss.item())
            # Re-enable dropout for the training steps that follow.
            model.train()
            print("Epoch: {}/{} | Step {}, Train Loss {:.4f}, Val Loss {:.4f}".
                  format(epoch+1, num_epochs, i+1, np.mean(train_losses), np.mean(val_losses)))
