- 🍨 This post is a learning-log entry from the 🔗365天深度学习训练营 (365-Day Deep Learning Training Camp)
- 🍖 Original author: K同学啊
I. Data Preprocessing
1. Load the data
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms, datasets
import os, PIL, pathlib, warnings

warnings.filterwarnings("ignore")  # suppress warning messages

# Windows 10 system
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
device(type='cuda')
import pandas as pd

# Load the custom Chinese dataset
train_data = pd.read_csv('F:/jupyter lab/DL-100-days/datasets/N8/train.csv', sep='\t', header=None, encoding='utf-8')
train_data.head()
|   | 0 | 1 |
|---|---|---|
| 0 | 还有双鸭山到淮阴的汽车票吗13号的 | Travel-Query |
| 1 | 从这里怎么回家 | Travel-Query |
| 2 | 随便播放一首专辑阁楼里的佛里的歌 | Music-Play |
| 3 | 给看一下墓王之王嘛 | FilmTele-Play |
| 4 | 我想看挑战两把s686打突变团竞的游戏视频 | Video-Play |
# Build a dataset iterator
def coustom_data_iter(texts, labels):
    for x, y in zip(texts, labels):
        yield x, y

x = train_data[0].values[:]  # text column
y = train_data[1].values[:]  # label column: raw category names (not one-hot encoded)
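As a quick illustrative check (assuming the cells above have run), the generator yields one (text, label) pair at a time:
# Illustrative: peek at the first (text, label) pair from a fresh generator
sample_iter = coustom_data_iter(x, y)
print(next(sample_iter))  # ('还有双鸭山到淮阴的汽车票吗13号的', 'Travel-Query')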
2. Build the vocabulary
from gensim.models.word2vec import Word2Vec
import numpy as np

# Train a Word2Vec shallow neural network model.
# Each sample is a raw string, so gensim iterates it character by character,
# i.e. this learns character-level embeddings for the Chinese text.
w2v = Word2Vec(vector_size=100,  # dimensionality of the feature vectors (default: 100)
               min_count=3)      # vocabulary cutoff: tokens seen fewer than min_count times are dropped (default: 5)
w2v.build_vocab(x)
w2v.train(x,
          total_examples=w2v.corpus_count,
          epochs=28)
(3827441, 5128984)
The returned tuple is (number of effective words actually trained on, total raw words processed).
# Convert a text to a vector. Note: despite the name, this sums the
# character vectors rather than averaging them.
def average_vec(text):
    vec = np.zeros(100).reshape((1, 100))
    for word in text:
        try:
            vec += w2v.wv[word].reshape((1, 100))
        except KeyError:
            # skip characters pruned from the vocabulary by min_count
            continue
    return vec
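The results in this post were produced with the summed version above. A true mean would divide by the number of in-vocabulary characters; a minimal sketch (hypothetical mean_vec, same KeyError handling):
# Hypothetical variant: divide by the number of in-vocabulary characters
def mean_vec(text):
    vec = np.zeros((1, 100))
    count = 0
    for word in text:
        try:
            vec += w2v.wv[word].reshape((1, 100))
            count += 1
        except KeyError:
            continue
    return vec / max(count, 1)  # guard against texts with no known characters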
# Stack the per-text vectors into a single ndarray
x_vec = np.concatenate([average_vec(z) for z in x])

# Save the Word2Vec model along with its word vectors
w2v.save('F:/jupyter lab/DL-100-days/datasets/N8/w2v_model.pkl')
# collate_batch (defined below) applies text_pipeline to every sample,
# so the iterator should yield the raw strings, not the precomputed x_vec
train_iter = coustom_data_iter(x, y)
len(x),len(x_vec)
(12100, 12100)
label_name = list(set(train_data[1].values[:]))
print(label_name)
['Alarm-Update', 'Radio-Listen', 'Calendar-Query', 'Weather-Query', 'Travel-Query', 'Audio-Play', 'HomeAppliance-Control', 'FilmTele-Play', 'TVProgram-Play', 'Other', 'Video-Play', 'Music-Play']
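Because set() ordering is not deterministic across Python processes, the label-to-index mapping above can change between runs. A small sketch of a reproducible alternative (the outputs in this post use the unsorted order shown above):
# Reproducible alternative: sort the unique labels before indexing
label_name = sorted(set(train_data[1].values[:]))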
3. Generate data batches and iterators
text_pipeline = lambda x: average_vec(x)
label_pipeline = lambda x: label_name.index(x)

text_pipeline("你在干嘛")
array([[-1.47463497e-01, 5.53675264e-01, 2.32937965e+00, 8.27723369e-01, -2.40717939e+00, 1.44922793e-01, 1.05791057e+00, 1.80504337e-01, 8.77318978e-02, 8.46821085e-01, -2.18721238e+00, -6.19571346e+00, 1.54999074e+00, -1.53929926e+00, 9.02176678e-01, 7.66459696e-01, 3.52216189e+00, -2.71442854e+00, 4.83723553e+00, -2.00612454e-01, 2.65928553e+00, -6.85812015e-01, 2.92455360e-01, -7.59955257e-01, -7.11056605e-01, -5.00715058e-02, -7.25709766e-01, -3.49449252e+00, 2.05362378e+00, 1.65073585e-01, 1.53607135e+00, 1.60568693e+00, -1.50479630e+00, -1.01070085e+00, 1.61834336e-01, 3.67275476e-02, -5.12860328e-01, 3.95214066e+00, -2.57145926e+00, 1.36886638e+00, 1.65003492e+00, 1.67193332e+00, -8.31996325e-01, 1.19858028e+00, -1.21710787e+00, 3.41078362e-01, 1.32124563e+00, -5.43934271e-01, -3.71614812e+00, 2.69695812e+00, -6.01738691e-04, -2.58512072e+00, 2.85854936e-03, -5.94619996e-01, -9.07128885e-01, -3.32832735e-01, -3.54674160e-02, -8.85167986e-01, -1.04638404e+00, -3.19511371e-01, 2.18448932e+00, -1.14190475e+00, 2.76876066e+00, -1.30832776e+00, -5.46692092e-01, -1.63290769e-01, -1.80786880e+00, 9.39842269e-01, 1.08917363e+00, -2.15198517e-01, 8.01670000e-01, 4.68951598e-01, 1.16898914e+00, -4.52896714e-01, 3.86154914e-01, -4.23372328e-01, -2.95600758e+00, 1.00093703e+00, 5.18836200e-01, -1.25538594e+00, -1.34598680e+00, -1.03631393e+00, -2.25449917e+00, 2.21089753e+00, -2.21546982e+00, -1.69246741e-01, 1.50789835e+00, -2.10600454e+00, -8.36849727e-01, -2.62724876e-01, -6.43695414e-01, -2.41657940e+00, 1.28879721e+00, 9.73569101e-01, 1.37036532e-01, -2.54981112e+00, -1.28008410e-01, 1.05215633e+00, -2.58280669e+00, 1.66395550e+00]])
label_pipeline("Travel-Query")
4
from torch.utils.data import DataLoader

def collate_batch(batch):
    label_list, text_list = [], []
    for _text, _label in batch:
        # convert the label to its index
        label_list.append(label_pipeline(_label))
        # convert each text to its word-vector representation
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.float32)
        text_list.append(processed_text)
    # concatenate into a [batch_size, 100] tensor
    text_tensor = torch.cat(text_list, dim=0).view(len(label_list), -1)  # [B, 100]
    label_tensor = torch.tensor(label_list, dtype=torch.int64)           # [B]
    # move everything to the device the model lives on
    return text_tensor.to(device), label_tensor.to(device)
# DataLoader usage example.
# Note: a plain Python generator has no __len__, so the default sampler
# cannot draw from it directly; section III therefore wraps the iterator
# with to_map_style_dataset before building the DataLoaders actually used.
dataloader = DataLoader(train_iter,
                        batch_size=8,
                        shuffle=False,
                        collate_fn=collate_batch)
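To actually draw a batch, wrap the single-pass generator first. An illustrative batch-shape check using to_map_style_dataset (the same wrapper section III relies on):
from torchtext.data.functional import to_map_style_dataset

# Illustrative: wrap the generator in a map-style dataset, then sample one batch
sample_ds = to_map_style_dataset(coustom_data_iter(x, y))
sample_loader = DataLoader(sample_ds, batch_size=8, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch = next(iter(sample_loader))
print(text_batch.shape, label_batch.shape)  # expected: torch.Size([8, 100]) torch.Size([8])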
II. Model Construction
1. Build the model
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, num_class):
        super(TextClassificationModel, self).__init__()
        # block 1: 100 -> 128, with batch norm, LeakyReLU and dropout
        self.fc1 = nn.Linear(100, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.act1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.3)
        # block 2: 128 -> 64
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.act2 = nn.LeakyReLU()
        self.dropout2 = nn.Dropout(0.2)
        # output layer: 64 -> num_class logits
        self.fc3 = nn.Linear(64, num_class)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.act2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x
2. Initialize the model
num_class = len(label_name)
vocab_size = 100000  # not used by this model
em_size = 12         # not used by this model
model = TextClassificationModel(num_class).to(device)
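A quick forward-pass sanity check (illustrative) confirms that a batch of 100-dimensional text vectors maps to num_class logits:
# Illustrative: a dummy batch of 8 averaged text vectors
with torch.no_grad():
    out = model(torch.randn(8, 100).to(device))
print(out.shape)  # expected: torch.Size([8, 12])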
3. Define the training and evaluation functions
import time

def train(dataloader):
    model.train()
    total_acc, total_loss, total_count = 0, 0, 0
    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # gradient clipping
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_loss += loss.item()
        total_count += label.size(0)
    # note: loss.item() is already a per-batch mean; dividing the sum of batch
    # means by the total sample count shrinks the reported loss by roughly the
    # batch size, which is why the printed losses below look so small
    return total_acc / total_count, total_loss / total_count

def evaluate(dataloader):
    model.eval()
    total_acc, total_loss, total_count = 0, 0, 0
    with torch.no_grad():
        for text, label in dataloader:
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_loss += loss.item()
            total_count += label.size(0)
    return total_acc / total_count, total_loss / total_count
III. Training the Model
1. Split the dataset and run the model
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

EPOCHS = 10
LR = 5
BATCH_SIZE = 64

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# StepLR with step_size=1: each scheduler.step() call multiplies the LR by
# gamma=0.1; below it is only called when validation accuracy stops improving
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

# Build the dataset
train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])
train_dataset = to_map_style_dataset(train_iter)
split_train, split_valid = random_split(train_dataset, [
    int(len(train_dataset) * 0.8),
    int(len(train_dataset) * 0.2)
])
train_dataloader = DataLoader(split_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
train_acc = []
train_loss = []
test_acc = []
test_loss = []

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_epoch_acc, train_epoch_loss = train(train_dataloader)
    val_acc, val_loss = evaluate(valid_dataloader)
    train_acc.append(train_epoch_acc)
    train_loss.append(train_epoch_loss)
    test_acc.append(val_acc)
    test_loss.append(val_loss)
    lr = optimizer.state_dict()['param_groups'][0]['lr']
    # decay the learning rate only when validation accuracy stops improving
    if total_accu is not None and total_accu > val_acc:
        scheduler.step()
    else:
        total_accu = val_acc
    print("_" * 69)
    print('| Epoch {:2d} | Time: {:4.2f}s | Train Acc: {:4.3f}, Loss: {:4.3f} | '
          'Val Acc: {:4.3f}, Loss: {:4.3f} | LR: {:4.6f}'.format(
              epoch,
              time.time() - epoch_start_time,
              train_epoch_acc,
              train_epoch_loss,
              val_acc,
              val_loss,
              lr
          ))
    print("-" * 69)
| Epoch  1 | Time: 2.55s | Train Acc: 0.882, Loss: 0.006 | Val Acc: 0.909, Loss: 0.004 | LR: 5.000000
| Epoch  2 | Time: 2.56s | Train Acc: 0.892, Loss: 0.005 | Val Acc: 0.912, Loss: 0.004 | LR: 5.000000
| Epoch  3 | Time: 2.86s | Train Acc: 0.890, Loss: 0.005 | Val Acc: 0.916, Loss: 0.004 | LR: 5.000000
| Epoch  4 | Time: 2.78s | Train Acc: 0.892, Loss: 0.005 | Val Acc: 0.914, Loss: 0.004 | LR: 5.000000
| Epoch  5 | Time: 2.76s | Train Acc: 0.904, Loss: 0.005 | Val Acc: 0.921, Loss: 0.004 | LR: 0.500000
| Epoch  6 | Time: 2.78s | Train Acc: 0.910, Loss: 0.004 | Val Acc: 0.920, Loss: 0.004 | LR: 0.500000
| Epoch  7 | Time: 2.53s | Train Acc: 0.912, Loss: 0.004 | Val Acc: 0.921, Loss: 0.004 | LR: 0.050000
| Epoch  8 | Time: 2.63s | Train Acc: 0.910, Loss: 0.004 | Val Acc: 0.917, Loss: 0.004 | LR: 0.005000
| Epoch  9 | Time: 2.54s | Train Acc: 0.909, Loss: 0.004 | Val Acc: 0.921, Loss: 0.004 | LR: 0.000500
| Epoch 10 | Time: 2.40s | Train Acc: 0.912, Loss: 0.004 | Val Acc: 0.920, Loss: 0.004 | LR: 0.000500
2. Accuracy and loss curves
import matplotlib.pyplot as plt
import warnings
from datetime import datetime

warnings.filterwarnings("ignore")
plt.rcParams['font.sans-serif'] = ['SimHei']  # support Chinese characters in plots
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100

# current time (used in the title)
current_time = datetime.now().strftime("%Y-%m-%d %H:%M")

# x axis: epoch range
epochs_range = range(len(train_acc))

plt.figure(figsize=(12, 3))

# accuracy curves
plt.subplot(1, 2, 1)
plt.plot(epochs_range, train_acc, label='Training Accuracy')
plt.plot(epochs_range, test_acc, label='Validation Accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title(f'Accuracy over epochs ({current_time})')
plt.legend(loc='lower right')

# loss curves
plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, test_loss, label='Validation Loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title('Loss over epochs')
plt.legend(loc='upper right')

plt.tight_layout()
plt.show()
# Note: this overwrites the test_acc/test_loss history lists with scalars;
# harmless here because the curves above have already been plotted.
test_acc, test_loss = evaluate(valid_dataloader)
print('Model accuracy: {:5.4f}'.format(test_acc))
Model accuracy: 0.9202
3. Test on a specific sample
def predict(text, text_pipeline):
    model.eval()  # BatchNorm requires eval mode for single-sample inference
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text), dtype=torch.float32)
        print(text.shape)
        output = model(text)
        return output.argmax(1).item()
# ex_text_str = "随便播放一首专辑阁楼里的佛里的歌"
ex_text_str = "还有双鸭山到淮阴的汽车票吗13号的"
model = model.to("cpu")
print("The predicted category is: %s" % label_name[predict(ex_text_str, text_pipeline)])
torch.Size([1, 100]) The predicted category is: Travel-Query
IV. Takeaways
Word2Vec provides the word-vector representations for this Chinese text classification task. It is a classic word-embedding method: trained on large-scale text data, it maps words into a continuous vector space. These vectors effectively capture the semantic and syntactic relationships between words, giving the downstream classification model more expressive input features.
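One way to see this semantic structure is a nearest-neighbor query on the trained model. A minimal sketch, assuming the w2v model from above is still in memory and that the query character survived the min_count=3 cutoff:
# Illustrative: characters the model places closest to '歌' ("song")
# in the embedding space (training here was character-level)
print(w2v.wv.most_similar('歌', topn=5))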