transformers - Reading Comprehension


from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

tokenizer('What is your name?', 'My name is Sylvain.')
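As a quick sanity check (a minimal sketch), decoding the encoded pair shows how the tokenizer concatenates the question and the context:

# Decode the question/context pair to see the [CLS] ... [SEP] ... [SEP] layout
encoded = tokenizer('What is your name?', 'My name is Sylvain.')
print(tokenizer.decode(encoded['input_ids']))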

from datasets import load_dataset, load_from_disk

# Load the dataset
dataset = load_dataset('squad')
# dataset = load_from_disk('datas/squad')

# Subsample: the full dataset is too large to train on here
dataset['train'] = dataset['train'].shuffle().select(range(10000))
dataset['validation'] = dataset['validation'].shuffle().select(range(200))

print(dataset['train'][0])

dataset
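Each SQuAD example has the fields id, title, context, question and answers, where answers holds parallel lists of answer strings and their character start offsets in the context. A minimal check that the recorded offset really points at the answer text:

# answer_start is a character index into the context
sample = dataset['train'][0]
start = sample['answers']['answer_start'][0]
text = sample['answers']['text'][0]
print(sample['context'][start:start + len(text)] == text)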

# Preprocessing function copied from the official tutorial: it converts SQuAD examples into
# token-level start/end labels. The logic is fairly involved; note that the end label it
# produces is inclusive (it points at the last answer token), which matters when decoding below.
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation='only_second',
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding='max_length',
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[
                        token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(
                    token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

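The key ingredient above is the offset mapping produced by the fast tokenizer: for every token it records the character span in the original string, special tokens get (0, 0), and spans of the second sequence refer to the context. A minimal sketch of what it looks like:

# Inspect token-to-character offsets for a short question/context pair
enc = tokenizer('What is your name?', 'My name is Sylvain.',
                return_offsets_mapping=True)
for token, span in zip(tokenizer.convert_ids_to_tokens(enc['input_ids']),
                       enc['offset_mapping']):
    print(token, span)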

# Run the SQuAD preprocessing function on a few examples
examples = prepare_train_features(dataset['train'][:3])

# First, inspect the processed output
for k, v in examples.items():
    print(k, len(v), v)
    print()

# Decode back to text to check the labels; end_positions is inclusive (it points at the
# last answer token), so the slice below needs + 1 or the final token would be dropped
for i in range(len(examples['input_ids'])):
    input_ids = examples['input_ids'][i]
    start_positions = examples['start_positions'][i]
    end_positions = examples['end_positions'][i]

    print('question and context')
    question_and_context = tokenizer.decode(input_ids)
    print(question_and_context)

    print('answer')
    answer = tokenizer.decode(input_ids[start_positions:end_positions + 1])
    print(answer)

    print('original answer')
    original_answer = dataset['train'][i]['answers']['text'][0]
    print(original_answer)
    print()

# Apply the preprocessing function to the whole dataset
dataset = dataset.map(
    function=prepare_train_features,
    batched=True,
    remove_columns=['id', 'title', 'context', 'question', 'answers'])

print(dataset['train'][0])

dataset
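After the map call only the model inputs remain, and the number of rows can exceed the original sample count because long contexts are split into overlapping features. A quick check:

# Remaining columns and feature counts after preprocessing
print(dataset['train'].column_names)
print(len(dataset['train']), len(dataset['validation']))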

import torch
from transformers.data.data_collator import default_data_collator

# Data loader
loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=default_data_collator,
    shuffle=True,
    drop_last=True,
)

for i, data in enumerate(loader):
    break

len(loader), data
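Each batch is a dict of tensors; with batch_size=8 and max_length=384 the expected shapes are [8, 384] for input_ids and attention_mask and [8] for the two position labels:

# Print the shape of every tensor in one batch
for k, v in data.items():
    print(k, v.shape)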

from transformers import AutoModelForQuestionAnswering, DistilBertModel

# Load the model (the stock QA model is shown commented out; a custom one is defined below)
#model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')


# Define the downstream-task model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = DistilBertModel.from_pretrained(
            'distilbert-base-uncased')

        self.fc = torch.nn.Sequential(torch.nn.Dropout(0.1),
                                      torch.nn.Linear(768, 2))
        
        # Initialize the linear layer from the stock QA model's qa_outputs head
        parameters = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')
        self.fc[1].load_state_dict(parameters.qa_outputs.state_dict())

    def forward(self, input_ids, attention_mask, start_positions,
                end_positions):
        #[b, lens] -> [b, lens, 768]
        logits = self.pretrained(input_ids=input_ids,
                                 attention_mask=attention_mask)
        logits = logits.last_hidden_state

        #[b, lens, 768] -> [b, lens, 2]
        logits = self.fc(logits)

        #[b, lens, 2] -> [b, lens, 1],[b, lens, 1]
        start_logits, end_logits = logits.split(1, dim=2)

        #[b, lens, 1] -> [b, lens]
        start_logits = start_logits.squeeze(2)
        end_logits = end_logits.squeeze(2)

        # Start/end labels must not exceed the sequence length; positions clamped
        # to `lens` are mapped to the ignore_index of the loss below
        lens = start_logits.shape[1]
        start_positions = start_positions.clamp(0, lens)
        end_positions = end_positions.clamp(0, lens)

        criterion = torch.nn.CrossEntropyLoss(ignore_index=lens)

        start_loss = criterion(start_logits, start_positions)
        end_loss = criterion(end_logits, end_positions)
        loss = (start_loss + end_loss) / 2

        return {
            'loss': loss,
            'start_logits': start_logits,
            'end_logits': end_logits
        }


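The custom Model mirrors the stock QA head: the same DistilBERT backbone, a dropout plus a 768->2 linear layer, and the same clamped cross-entropy loss over start and end positions. For reference, a minimal sketch of the equivalent call through AutoModelForQuestionAnswering (its freshly initialized qa_outputs head will differ numerically, but the interface is the same):

# The stock model accepts the same batch and returns loss, start_logits and end_logits
ref_model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')
ref_out = ref_model(**data)
ref_out.loss, ref_out.start_logits.shape, ref_out.end_logits.shape
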
model = Model()

# Count the parameters (printed in units of 10,000)
print(sum(i.numel() for i in model.parameters()) / 10000)

out = model(**data)

out['loss'], out['start_logits'].shape, out['end_logits'].shape
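To turn the logits back into text, take the argmax over the sequence dimension for the start and end heads and decode that span; since the end index is inclusive, the slice needs + 1. A minimal sketch for the first sample in the batch (the head is not trained yet, so expect noise):

# Decode the predicted span of the first sample
start = out['start_logits'][0].argmax().item()
end = out['end_logits'][0].argmax().item()
print(tokenizer.decode(data['input_ids'][0][start:end + 1]))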

Testing

# Evaluate on the validation set
def test():
    model.eval()

    # Validation data loader
    loader_val = torch.utils.data.DataLoader(
        dataset=dataset['validation'],
        batch_size=16,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True,
    )

    start_offset = 0
    end_offset = 0
    total = 0
    for i, data in enumerate(loader_val):
        # Forward pass without gradient tracking
        with torch.no_grad():
            out = model(**data)

        start_offset += (out['start_logits'].argmax(dim=1) -
                         data['start_positions']).abs().sum().item()

        end_offset += (out['end_logits'].argmax(dim=1) -
                       data['end_positions']).abs().sum().item()

        total += 16

        if i % 10 == 0:
            print(i)

        if i == 50:
            break

    print(start_offset / total, end_offset / total)

    start_logits = out['start_logits'].argmax(dim=1)
    end_logits = out['end_logits'].argmax(dim=1)
    for i in range(4):
        input_ids = data['input_ids'][i]

        # end indices are inclusive, so extend each slice by one token
        pred_answer = input_ids[start_logits[i]:end_logits[i] + 1]

        label_answer = input_ids[
            data['start_positions'][i]:data['end_positions'][i] + 1]

        print('input_ids=', tokenizer.decode(input_ids))
        print('pred_answer=', tokenizer.decode(pred_answer))
        print('label_answer=', tokenizer.decode(label_answer))
        print()


test()
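The mean index offsets printed by test() are only a rough proxy for answer quality. A stricter per-batch check (a sketch, not used by the script above) is the exact span match rate, which could be accumulated inside the evaluation loop:

def exact_match(out, data):
    # Fraction of samples whose predicted start and end indices both equal the labels
    hit = ((out['start_logits'].argmax(dim=1) == data['start_positions']) &
           (out['end_logits'].argmax(dim=1) == data['end_positions']))
    return hit.float().mean().item()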

Training

from torch.optim import AdamW  # transformers.AdamW is deprecated in recent versions
from transformers.optimization import get_scheduler


# Train
def train():
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.train()
    for i, data in enumerate(loader):
        out = model(**data)
        loss = out['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        optimizer.zero_grad()
        model.zero_grad()

        if i % 50 == 0:
            lr = optimizer.state_dict()['param_groups'][0]['lr']

            start_offset = (out['start_logits'].argmax(dim=1) -
                            data['start_positions']).abs().sum().item() / 8

            end_offset = (out['end_logits'].argmax(dim=1) -
                          data['end_positions']).abs().sum().item() / 8

            print(i, loss.item(), lr, start_offset, end_offset)

    torch.save(model, 'models/3.阅读理解.model')


train()

model = torch.load('models/3.阅读理解.model')
test()
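Saving the whole module with torch.save requires the Model class to be importable when loading. A common alternative (a sketch with an example path, not what the script above does) is to save and restore only the state_dict:

# Save the weights only, then load them into a fresh Model instance
torch.save(model.state_dict(), 'models/3.阅读理解.state_dict')  # example path
model = Model()
model.load_state_dict(torch.load('models/3.阅读理解.state_dict'))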


