from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

# Load the tokenizer and run it once on a question/context pair to show the
# encoding it produces.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer('What is your name?', 'My name is Sylvain.')

# Load the SQuAD dataset. The commented-out line loads a copy previously
# saved to disk instead.
dataset = load_dataset('squad')
# dataset = load_from_disk('datas/squad')

# Subsample both splits: the full dataset is too large to run here.
for split, size in (('train', 10000), ('validation', 200)):
    dataset[split] = dataset[split].shuffle().select(range(size))

print(dataset['train'][0])
dataset
# SQuAD preprocessing function, adapted from the official tutorial.
def prepare_train_features(examples):
    """Tokenize a batch of SQuAD examples and label every feature with its answer span.

    Each question/context pair is encoded with a maximum length of 384 tokens.
    Contexts that do not fit are split into several overlapping features
    (stride 128), so one example may yield more than one feature.

    Args:
        examples: A batch with the columns "question", "context" and
            "answers" (each answer holding "answer_start" and "text" lists).
            The "question" column is overwritten with left-stripped questions.

    Returns:
        The tokenizer output with two extra keys, "start_positions" and
        "end_positions". Both are token indices into the feature and the end
        index is inclusive. Features without an answer, or whose window does
        not fully contain the answer, are labelled with the CLS token index.
    """
    # Leading whitespace in a question eats into the token budget and can make
    # the truncation of the context fail, so strip it first.
    questions = [question.lstrip() for question in examples["question"]]
    examples["question"] = questions

    # Only the context (second sequence) is truncated; overflowing parts are
    # kept as additional features that overlap the previous one by the stride.
    encoded = tokenizer(
        questions,
        examples['context'],
        truncation='only_second',
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    # Feature index -> index of the example it was cut from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    # Per feature: token index -> (start, end) character offsets.
    all_offsets = encoded.pop("offset_mapping")

    start_labels = []
    end_labels = []
    for feature_idx, offsets in enumerate(all_offsets):
        input_ids = encoded["input_ids"][feature_idx]
        # Impossible answers are labelled with the position of the CLS token.
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # Sequence ids tell question tokens apart from context tokens (id 1).
        sequence_ids = encoded.sequence_ids(feature_idx)
        answers = examples["answers"][feature_to_example[feature_idx]]

        # No answer given at all.
        if len(answers["answer_start"]) == 0:
            start_labels.append(cls_index)
            end_labels.append(cls_index)
            continue

        # Character span of the first listed answer inside the context.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last context token of this feature.
        context_first = sequence_ids.index(1)
        context_last = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # Answer not fully inside this window: label with CLS as well.
        answer_in_window = (offsets[context_first][0] <= start_char
                            and offsets[context_last][1] >= end_char)
        if not answer_in_window:
            start_labels.append(cls_index)
            end_labels.append(cls_index)
            continue

        # Walk forward past the last token that starts at or before the
        # answer start. The bound check covers the edge case where the answer
        # is the last word and the scan runs off the end of the offsets.
        token_start = context_first
        while (token_start < len(offsets)
               and offsets[token_start][0] <= start_char):
            token_start += 1
        start_labels.append(token_start - 1)

        # Walk backward past every token that ends at or after the answer
        # end; the token just after the stopping point closes the answer.
        token_end = context_last
        while offsets[token_end][1] >= end_char:
            token_end -= 1
        end_labels.append(token_end + 1)

    encoded["start_positions"] = start_labels
    encoded["end_positions"] = end_labels
    return encoded
# Run the SQuAD preprocessing function on a small sample.
examples = prepare_train_features(dataset['train'][:3])

# Inspect the processed result.
for k, v in examples.items():
    print(k, len(v), v)
    print()

# Decode the features back to text to check the labels.
# `end_positions` is an INCLUSIVE token index, so the slice has to run to
# `end + 1`; slicing to `end` silently drops the last answer token.
for i, input_ids in enumerate(examples['input_ids']):
    start_position = examples['start_positions'][i]
    end_position = examples['end_positions'][i]

    print('问题和文本')
    question_and_context = tokenizer.decode(input_ids)
    print(question_and_context)

    print('答案')
    answer = tokenizer.decode(input_ids[start_position:end_position + 1])
    print(answer)

    print('原答案')
    # NOTE(review): `i` is a feature index, not an example index. The two only
    # coincide while no context overflows into a second feature - confirm for
    # the sampled rows before trusting this comparison.
    original_answer = dataset['train'][i]['answers']['text'][0]
    print(original_answer)
    print()

# Apply the preprocessing function to every split, dropping the raw columns.
dataset = dataset.map(
    function=prepare_train_features,
    batched=True,
    remove_columns=['id', 'title', 'context', 'question', 'answers'])

print(dataset['train'][0])
dataset
import torch
from transformers.data.data_collator import default_data_collator

# Training data loader: shuffled batches of 8, incomplete last batch dropped.
loader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=8,
    shuffle=True,
    drop_last=True,
    collate_fn=default_data_collator,
)

# Grab the first batch only; `data` is reused below as a sample input.
for i, data in enumerate(loader):
    break

len(loader), data
from transformers import AutoModelForQuestionAnswering, DistilBertModel
#加载模型
#model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')
#定义下游任务模型
class Model(torch.nn.Module):
    """DistilBERT encoder with a linear span-prediction head for extractive QA.

    The head maps every token's 768-dimensional hidden state to two logits:
    one for "answer starts here" and one for "answer ends here".
    """

    def __init__(self):
        super().__init__()
        self.pretrained = DistilBertModel.from_pretrained(
            'distilbert-base-uncased')
        self.fc = torch.nn.Sequential(torch.nn.Dropout(0.1),
                                      torch.nn.Linear(768, 2))

        # Initialise the linear layer from the `qa_outputs` head of the
        # library's question-answering model.
        # NOTE(review): this base checkpoint presumably ships no fine-tuned QA
        # head, in which case these weights are freshly initialised - confirm.
        parameters = AutoModelForQuestionAnswering.from_pretrained(
            'distilbert-base-uncased')
        self.fc[1].load_state_dict(parameters.qa_outputs.state_dict())

    def forward(self, input_ids, attention_mask, start_positions=None,
                end_positions=None):
        """Compute start/end logits and, when labels are given, the loss.

        Args:
            input_ids: Token ids, shape [b, lens].
            attention_mask: Attention mask passed to the encoder.
            start_positions: Optional gold start token indices, shape [b].
            end_positions: Optional gold end token indices, shape [b].

        Returns:
            A dict with 'loss' (mean of the start and end cross-entropy, or
            None when either label tensor is missing), 'start_logits' and
            'end_logits' (both of shape [b, lens]).
        """
        # [b, lens] -> [b, lens, 768]
        hidden = self.pretrained(input_ids=input_ids,
                                 attention_mask=attention_mask)
        hidden = hidden.last_hidden_state

        # [b, lens, 768] -> [b, lens, 2]
        logits = self.fc(hidden)

        # [b, lens, 2] -> [b, lens, 1], [b, lens, 1]
        start_logits, end_logits = logits.split(1, dim=2)

        # [b, lens, 1] -> [b, lens]
        start_logits = start_logits.squeeze(2)
        end_logits = end_logits.squeeze(2)

        # Labels are optional so the model can also be called for inference.
        loss = None
        if start_positions is not None and end_positions is not None:
            # Positions beyond the sequence are clamped to `lens`, which is
            # exactly the index the loss is told to ignore.
            lens = start_logits.shape[1]
            start_positions = start_positions.clamp(0, lens)
            end_positions = end_positions.clamp(0, lens)

            criterion = torch.nn.CrossEntropyLoss(ignore_index=lens)
            start_loss = criterion(start_logits, start_positions)
            end_loss = criterion(end_logits, end_positions)
            loss = (start_loss + end_loss) / 2

        return {
            'loss': loss,
            'start_logits': start_logits,
            'end_logits': end_logits
        }
# Instantiate the downstream model.
model = Model()

# Report the parameter count in units of 10,000.
param_count = sum(p.numel() for p in model.parameters())
print(param_count / 10000)

# Run the sample batch through the model as a quick sanity check.
out = model(**data)
out['loss'], out['start_logits'].shape, out['end_logits'].shape
# Test
#测试
def test():
    """Evaluate the global `model` on the validation split and print samples.

    Prints the mean absolute distance (in tokens) between the predicted and
    the gold start / end positions, followed by up to four decoded examples
    from the last evaluated batch. Evaluation stops after batch index 50.
    """
    model.eval()

    # Validation data loader.
    loader_val = torch.utils.data.DataLoader(
        dataset=dataset['validation'],
        batch_size=16,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True,
    )

    start_offset = 0
    end_offset = 0
    total = 0
    out = None
    data = None
    for i, data in enumerate(loader_val):
        # Forward pass without gradient tracking.
        with torch.no_grad():
            out = model(**data)

        start_offset += (out['start_logits'].argmax(dim=1) -
                         data['start_positions']).abs().sum().item()
        end_offset += (out['end_logits'].argmax(dim=1) -
                       data['end_positions']).abs().sum().item()

        # Count the real batch size rather than assuming 16.
        total += len(data['input_ids'])

        if i % 10 == 0:
            print(i)

        if i == 50:
            break

    # With drop_last=True a split smaller than one batch yields nothing.
    if total == 0:
        print('no validation batches to evaluate')
        return

    print(start_offset / total, end_offset / total)

    pred_start = out['start_logits'].argmax(dim=1)
    pred_end = out['end_logits'].argmax(dim=1)
    for i in range(min(4, len(data['input_ids']))):
        input_ids = data['input_ids'][i]

        # Start and end are both inclusive token indices, so the slices must
        # run to `end + 1` to keep the last answer token.
        pred_answer = input_ids[pred_start[i]:pred_end[i] + 1]
        label_answer = input_ids[
            data['start_positions'][i]:data['end_positions'][i] + 1]

        print('input_ids=', tokenizer.decode(input_ids))
        print('pred_answer=', tokenizer.decode(pred_answer))
        print('label_answer=', tokenizer.decode(label_answer))
        print()


test()
# Training
from transformers import AdamW
from transformers.optimization import get_scheduler
#训练
def train():
    """Fine-tune the global `model` for one pass over `loader` and save it.

    Uses AdamW (lr 2e-5) with a linear schedule without warm-up and gradient
    clipping at norm 1.0. Every 50 steps it prints the step index, the loss,
    the current learning rate and the mean absolute start / end position
    error of the batch. The whole model object is pickled to
    'models/3.阅读理解.model' at the end.
    """
    import os

    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.train()
    for i, data in enumerate(loader):
        out = model(**data)
        loss = out['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # The optimizer holds every model parameter, so this clears all grads.
        optimizer.zero_grad()

        if i % 50 == 0:
            lr = optimizer.param_groups[0]['lr']
            # Divide by the real batch size rather than assuming 8.
            batch_size = len(data['input_ids'])

            start_offset = (out['start_logits'].argmax(dim=1) -
                            data['start_positions']).abs().sum().item() / batch_size
            end_offset = (out['end_logits'].argmax(dim=1) -
                          data['end_positions']).abs().sum().item() / batch_size

            print(i, loss.item(), lr, start_offset, end_offset)

    # Create the target directory first; otherwise a missing `models/` folder
    # makes the save fail after the whole training run has finished.
    save_path = 'models/3.阅读理解.model'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model, save_path)
# Train, reload the saved model from disk, then evaluate it.
train()

# The checkpoint is a fully pickled module written by train(), not a plain
# state dict, so it needs weights_only=False; newer PyTorch releases default
# to weights_only=True and refuse to unpickle it. Only do this for files you
# produced yourself.
model = torch.load('models/3.阅读理解.model', weights_only=False)
test()
# 2022-12-08