1. Introduction
The hard part of pretraining a large language model is not the algorithm but the data and the compute: the vast majority of companies and institutions simply do not have the compute resources to pretrain one. In industrial practice, the common approach is therefore to fine-tune the parameters of an open-source large language model on domain data to build a domain-specific model.
This article describes how to load open-source model parameters to replace the randomly initialized parameters of the large language model GPTModel. The follow-up articles on fine-tuning will use supervised fine-tuning and instruction fine-tuning, respectively, to adapt the GPTModel parameters so that the model can classify text and answer questions.
2. Obtaining Open-Source LLM Parameters
OpenAI has released the parameters of its GPT-2 large language model, which was trained with the TensorFlow framework. The following code downloads the parameters of the GPT-2 small version (124M) from OpenAI's official download location:
import os
import urllib.request

from tqdm import tqdm


def download_openai_params(model_size, openai_params_dir):
    allowed_sizes = ["124M", "355M", "774M", "1558M"]
    if model_size not in allowed_sizes:
        raise ValueError(f"model_size not in {allowed_sizes}")

    params_dir = os.path.join(openai_params_dir, "gpt2_" + model_size)
    os.makedirs(params_dir, exist_ok=True)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json", "vocab.bpe",
        "model.ckpt.index", "model.ckpt.meta", "model.ckpt.data-00000-of-00001"
    ]

    for filename in filenames:
        file_url = f"{base_url}/{model_size}/{filename}"
        file_path = os.path.join(params_dir, filename)
        with urllib.request.urlopen(file_url) as response:
            file_size = int(response.headers.get("Content-Length", 0))
            # Skip files that have already been fully downloaded
            if os.path.exists(file_path):
                file_size_local = os.path.getsize(file_path)
                if file_size == file_size_local:
                    print(f"File already exists and is up-to-date: {file_path}")
                    continue
            # Stream the file to disk in 1 KiB chunks with a progress bar
            block_size = 1024
            progress_bar_description = os.path.basename(file_url)
            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
                with open(file_path, "wb") as file:
                    while True:
                        chunk = response.read(block_size)
                        if not chunk:
                            break
                        file.write(chunk)
                        progress_bar.update(len(chunk))


download_openai_params(model_size="124M", openai_params_dir="openai_params")
Running the code above prints the following:
checkpoint: 100%|██████████████████████████████████████████████████████████████████| 77.0/77.0 [00:00<00:00, 73.2kiB/s]
encoder.json: 100%|███████████████████████████████████████████████████████████████| 1.04M/1.04M [00:04<00:00, 245kiB/s]
hparams.json: 100%|████████████████████████████████████████████████████████████████████████| 90.0/90.0 [00:00<?, ?iB/s]
vocab.bpe: 100%|████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 489kiB/s]
model.ckpt.index: 100%|██████████████████████████████████████████████████████████| 5.21k/5.21k [00:00<00:00, 1.73MiB/s]
model.ckpt.meta: 100%|██████████████████████████████████████████████████████████████| 471k/471k [00:00<00:00, 505kiB/s]
model.ckpt.data-00000-of-00001: 100%|███████████████████████████████████████████████| 498M/498M [13:19<00:00, 622kiB/s]
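Before loading anything, it can be useful to check what was downloaded. The sketch below (assuming the openai_params directory used above) simply reads hparams.json, which records the architecture hyperparameters of the checkpoint:

import json
import os

# Read the hyperparameters shipped with the GPT-2 small checkpoint
# (assumes the download directory used above).
with open(os.path.join("openai_params", "gpt2_124M", "hparams.json"), "rt", encoding="utf-8") as f:
    hparams = json.load(f)
print(hparams)
# For the 124M model this should report n_vocab=50257, n_ctx=1024,
# n_embd=768, n_head=12 and n_layer=12.

These values correspond to the vocabulary size, context length, embedding dimension, number of attention heads, and number of Transformer blocks used when GPTModel is instantiated later in this article.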
Since OpenAI's GPT-2 was trained with TensorFlow, the checkpoint path can be obtained with tf.train.latest_checkpoint, and the names and shapes of the model's parameters can be listed with tf.train.list_variables:
import tensorflow as tf
ckpt_path = tf.train.latest_checkpoint("openai_params/gpt2_124M")
variables = tf.train.list_variables(ckpt_path)
variables
Running the code above prints the following:
[('model/h0/attn/c_attn/b', [2304]),
('model/h0/attn/c_attn/w', [1, 768, 2304]),
('model/h0/attn/c_proj/b', [768]),
('model/h0/attn/c_proj/w', [1, 768, 768]),
('model/h0/ln_1/b', [768]),
('model/h0/ln_1/g', [768]),
('model/h0/ln_2/b', [768]),
('model/h0/ln_2/g', [768]),
('model/h0/mlp/c_fc/b', [3072]),
('model/h0/mlp/c_fc/w', [1, 768, 3072]),
('model/h0/mlp/c_proj/b', [768]),
('model/h0/mlp/c_proj/w', [1, 3072, 768]),
[...]
('model/h9/attn/c_attn/b', [2304]),
('model/h9/attn/c_attn/w', [1, 768, 2304]),
('model/h9/attn/c_proj/b', [768]),
('model/h9/attn/c_proj/w', [1, 768, 768]),
('model/h9/ln_1/b', [768]),
('model/h9/ln_1/g', [768]),
('model/h9/ln_2/b', [768]),
('model/h9/ln_2/g', [768]),
('model/h9/mlp/c_fc/b', [3072]),
('model/h9/mlp/c_fc/w', [1, 768, 3072]),
('model/h9/mlp/c_proj/b', [768]),
('model/h9/mlp/c_proj/w', [1, 3072, 768]),
('model/ln_f/b', [768]),
('model/ln_f/g', [768]),
('model/wpe', [1024, 768]),
('model/wte', [50257, 768])]
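As a quick sanity check, the sketch below (reusing the variables list from the previous snippet) sums the sizes of all checkpoint variables; the total comes out to roughly 124 million, which is where the model size name comes from:

import numpy as np

# Sum the number of elements of every variable in the checkpoint.
total_params = sum(int(np.prod(shape)) for _, shape in variables)
print(f"Total parameters: {total_params:,}")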
3. Loading Open-Source LLM Parameters
When a deep neural network is trained with gradient descent, its parameters are first randomly initialized and then updated step by step until convergence. In industrial LLM practice, the randomly initialized parameters are instead replaced with the parameters of an open-source model, which works around the lack of compute for pretraining.
To load OpenAI's open-source GPT-2 parameters into the large language model GPTModel in place of its randomly initialized parameters, we first read the checkpoint and build a parameter dictionary params. Its keys are the names of the submodule parameters of OpenAI's GPT-2, and its values are the corresponding torch.nn.Parameter objects. The code is shown below:
import json
import torch
import numpy as np


def load_openai_ckpt(ckpt_dir):
    ckpt_path = tf.train.latest_checkpoint(ckpt_dir)
    with open(os.path.join(ckpt_dir, "hparams.json"), "rt", encoding="utf-8") as f:
        settings = json.load(f)

    # One nested dict per Transformer block, plus top-level entries for
    # the embeddings and the final LayerNorm
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}
    for name, _ in tf.train.list_variables(ckpt_path):
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))
        variable_array = torch.nn.Parameter(torch.tensor(variable_array))

        # Variable names look like "model/h3/attn/c_attn/w"; drop the leading
        # "model" and route block variables ("h0", "h1", ...) into the
        # corresponding entry of params["blocks"]
        variable_name_parts = name.split("/")[1:]
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array  # noqa
    return params


params = load_openai_ckpt("openai_params/gpt2_124M")

print("Parameter dictionary keys:", params.keys())
print("Token embedding parameter dimensions:", params["wte"].shape)
print("Token embedding parameter:\n", params["wte"])
Running the code above prints the following:
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])
Token embedding parameter dimensions: torch.Size([50257, 768])
Token embedding parameter:
Parameter containing:
tensor([[-0.1101, -0.0393, 0.0331, ..., -0.1364, 0.0151, 0.0453],
[ 0.0403, -0.0486, 0.0462, ..., 0.0861, 0.0025, 0.0432],
[-0.1275, 0.0479, 0.1841, ..., 0.0899, -0.1297, -0.0879],
...,
[-0.0445, -0.0548, 0.0123, ..., 0.1044, 0.0978, -0.0695],
[ 0.1860, 0.0167, 0.0461, ..., -0.0963, 0.0785, -0.0225],
[ 0.0514, -0.0277, 0.0499, ..., 0.0070, 0.1552, 0.1207]],
requires_grad=True)
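The parameters of each Transformer block are stored as nested dictionaries inside params["blocks"]. Below is a small sketch for inspecting the first block (reusing params from above); the exact key order may differ, but the shapes in the comments follow from the variable list printed earlier:

# Inspect the nested parameter dictionary of the first Transformer block.
block0 = params["blocks"][0]
print(block0.keys())                        # attn, ln_1, ln_2, mlp
print(block0["attn"]["c_attn"]["w"].shape)  # torch.Size([768, 2304]) after np.squeeze
print(block0["mlp"]["c_fc"]["w"].shape)     # torch.Size([768, 3072])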
Define a function load_openai_params that replaces the corresponding parameters of GPTModel with the entries of the parameter dictionary params according to the following rules:

- wte replaces the Token Embedding layer parameter gpt2_small.tok_emb.weight;
- wpe replaces the Positional Embedding layer parameter gpt2_small.pos_emb.weight;
- attn.c_attn.w replaces att.W_qkv.weight in the multi-head attention submodule;
- attn.c_attn.b replaces att.W_qkv.bias in the multi-head attention submodule;
- attn.c_proj.w replaces att.out_proj.weight in the multi-head attention submodule;
- attn.c_proj.b replaces att.out_proj.bias in the multi-head attention submodule;
- mlp.c_fc.w replaces ff.layers[0].weight of the first Linear layer in the feed-forward submodule;
- mlp.c_fc.b replaces ff.layers[0].bias of the first Linear layer in the feed-forward submodule;
- mlp.c_proj.w replaces ff.layers[2].weight of the second Linear layer in the feed-forward submodule;
- mlp.c_proj.b replaces ff.layers[2].bias of the second Linear layer in the feed-forward submodule;
- ln_1.g replaces the Layer Normalization parameter norm1.scale in the multi-head attention submodule;
- ln_1.b replaces the Layer Normalization parameter norm1.shift in the multi-head attention submodule;
- ln_2.g replaces the Layer Normalization parameter norm2.scale in the feed-forward submodule;
- ln_2.b replaces the Layer Normalization parameter norm2.shift in the feed-forward submodule;
- g replaces final_norm.scale of the Layer Normalization layer applied to the input of the final output layer;
- b replaces final_norm.shift of the Layer Normalization layer applied to the input of the final output layer;
- wte also replaces the output layer parameter out_linear.weight.
The code is shown below:
def load_openai_params(model, params):
    # Embedding layers
    model.tok_emb.weight = params['wte']
    model.pos_emb.weight = params['wpe']

    for b in range(len(params["blocks"])):
        # Multi-head attention
        model.trf_blocks[b].att.W_qkv.weight = torch.nn.Parameter(params["blocks"][b]["attn"]["c_attn"]["w"].T)
        model.trf_blocks[b].att.W_qkv.bias = params["blocks"][b]["attn"]["c_attn"]["b"]
        model.trf_blocks[b].att.out_proj.weight = torch.nn.Parameter(params["blocks"][b]["attn"]["c_proj"]["w"].T)
        model.trf_blocks[b].att.out_proj.bias = params["blocks"][b]["attn"]["c_proj"]["b"]

        # Feed-forward network
        model.trf_blocks[b].ff.layers[0].weight = torch.nn.Parameter(params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        model.trf_blocks[b].ff.layers[0].bias = params["blocks"][b]["mlp"]["c_fc"]["b"]
        model.trf_blocks[b].ff.layers[2].weight = torch.nn.Parameter(params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        model.trf_blocks[b].ff.layers[2].bias = params["blocks"][b]["mlp"]["c_proj"]["b"]

        # Layer Normalization
        model.trf_blocks[b].norm1.scale = params["blocks"][b]["ln_1"]["g"]
        model.trf_blocks[b].norm1.shift = params["blocks"][b]["ln_1"]["b"]
        model.trf_blocks[b].norm2.scale = params["blocks"][b]["ln_2"]["g"]
        model.trf_blocks[b].norm2.shift = params["blocks"][b]["ln_2"]["b"]

    # Final LayerNorm and output layer (tied to the token embedding)
    model.final_norm.scale = params["g"]
    model.final_norm.shift = params["b"]
    model.out_linear.weight = params["wte"]
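The .T transposes are needed because the OpenAI checkpoint stores weight matrices as (in_features, out_features), whereas torch.nn.Linear stores its weight as (out_features, in_features). A small sketch that makes the mismatch explicit (reusing params from above):

# Weight layout in the OpenAI checkpoint: (in_features, out_features)
w_openai = params["blocks"][0]["attn"]["c_attn"]["w"]
print(w_openai.shape)       # torch.Size([768, 2304])

# Weight layout of torch.nn.Linear: (out_features, in_features)
linear = torch.nn.Linear(768, 2304)
print(linear.weight.shape)  # torch.Size([2304, 768])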
Instantiate the large language model gpt2_small, load OpenAI's open-source GPT-2 parameters with load_openai_params, call the text generation function generate_text described in 从零开始实现大语言模型(十二):文本生成策略, and print the generated text:
import tiktoken

# from [从零开始实现大语言模型(七):多头注意力机制] import MultiHeadAttention
# from [从零开始实现大语言模型(八):Layer Normalization] import LayerNorm
# from [从零开始实现大语言模型(九):前馈神经网络与GELU激活函数] import GELU, FeedForward
# from [从零开始实现大语言模型(十一):构建大语言模型GPTModel] import TransformerBlock, GPTModel
# from [从零开始实现大语言模型(十二):文本生成策略] import generate_text

embedding_dim = 768
num_layers = 12
num_heads = 12
context_len = 1024
vocabulary_size = 50257
dropout = 0.1
qkv_bias = True

tokenizer = tiktoken.encoding_for_model("gpt2")
gpt2_small = GPTModel(
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    context_len=context_len,
    vocabulary_size=vocabulary_size,
    dropout=dropout,
    qkv_bias=qkv_bias
)
load_openai_params(gpt2_small, params)

torch.manual_seed(123)
text = generate_text(
    model=gpt2_small, start_context="Every effort moves you", max_new_tokens=23,
    context_size=1024, tokenizer=tokenizer, temperature=0.3, top_k=50, compact_format=True
)
print(text)
Running the code above prints the following:
Every effort moves you forward, but it's a process. It's a process of learning, and it's a process of learning.
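Converting the TensorFlow checkpoint is relatively slow, so it can be convenient to save the loaded parameters once as an ordinary PyTorch state dict and restore them from there in later experiments. A minimal sketch (the file name gpt2_small_124M.pth is just an example):

# Persist the converted parameters so the TensorFlow checkpoint only needs
# to be processed once; the file name is arbitrary.
torch.save(gpt2_small.state_dict(), "gpt2_small_124M.pth")

# A freshly constructed GPTModel with the same hyperparameters can then
# restore the parameters directly:
# gpt2_small.load_state_dict(torch.load("gpt2_small_124M.pth"))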
4. Hugging Face and ModelScope
Hugging Face and ModelScope are two of the largest open-source model communities, and open-source LLM parameters can be loaded from them directly. As shown below, the open-source GPT-2 model can be loaded from Hugging Face with GPT2Model.from_pretrained:
from transformers import GPT2Model
hf_gpt2_small = GPT2Model.from_pretrained("openai-community/gpt2", cache_dir="huggingface_params")
print(hf_gpt2_small)
Running the code above prints the following:
GPT2Model(
(wte): Embedding(50257, 768)
(wpe): Embedding(1024, 768)
(drop): Dropout(p=0.1, inplace=False)
(h): ModuleList(
(0): GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
[...]
(11): GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
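The printed module tree corresponds to a flat state_dict whose key names (for example h.0.attn.c_attn.weight) will be used below when copying parameters into GPTModel. A small sketch for inspecting a few entries; note that the Conv1D layers store their weights as (in_features, out_features):

# Inspect a few entries of the Hugging Face model's state_dict.
hf_state_dict = hf_gpt2_small.state_dict()
print(hf_state_dict["wte.weight"].shape)              # torch.Size([50257, 768])
print(hf_state_dict["h.0.attn.c_attn.weight"].shape)  # torch.Size([768, 2304])
print(hf_state_dict["h.0.mlp.c_fc.weight"].shape)     # torch.Size([768, 3072])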
Define a class HFGPT2ModelWrapper that inherits from torch.nn.Module. Its __init__ method creates the GPT-2 output layer self.out_linear and replaces the randomly initialized weights of self.out_linear with the Token Embedding layer parameters. Its forward method feeds x through the Hugging Face model hf_gpt2_small and passes the resulting last_hidden_state through the output layer self.out_linear created in __init__. Create an HFGPT2ModelWrapper instance gpt2_small, call the text generation function generate_text, and print the generated text:
class HFGPT2ModelWrapper(torch.nn.Module):
    def __init__(self, hf_model):
        super().__init__()
        self.hf_model = hf_model
        # Output layer whose weights are tied to the token embedding (wte)
        self.out_linear = torch.nn.Linear(
            hf_model.wte.weight.shape[1], hf_model.wte.weight.shape[0], bias=False
        )
        self.out_linear.weight = hf_model.wte.weight

    def forward(self, x):
        last_hidden_state = self.hf_model(x)["last_hidden_state"]
        return self.out_linear(last_hidden_state)


gpt2_small = HFGPT2ModelWrapper(hf_gpt2_small)

torch.manual_seed(123)
text = generate_text(
    model=gpt2_small, start_context="Every effort moves you", max_new_tokens=23,
    context_size=1024, tokenizer=tokenizer, temperature=0.3, top_k=50, compact_format=True
)
print(text)
Running the code above prints the following:
Every effort moves you forward, but it's a process. It's a process of learning, and it's a process of learning.
We can go one step further and define a function load_huggingface_params that replaces the randomly initialized parameters of GPTModel with the open-source GPT-2 parameters loaded from Hugging Face:
def load_huggingface_params(model, hf_model):
    state_dict = hf_model.state_dict()

    # Embedding layers
    model.pos_emb.weight = torch.nn.Parameter(state_dict["wpe.weight"])
    model.tok_emb.weight = torch.nn.Parameter(state_dict["wte.weight"])

    for b in range(len(hf_model.h)):
        # Multi-head attention
        model.trf_blocks[b].att.W_qkv.weight = torch.nn.Parameter(state_dict[f"h.{b}.attn.c_attn.weight"].T)
        model.trf_blocks[b].att.W_qkv.bias = torch.nn.Parameter(state_dict[f"h.{b}.attn.c_attn.bias"])
        model.trf_blocks[b].att.out_proj.weight = torch.nn.Parameter(state_dict[f"h.{b}.attn.c_proj.weight"].T)
        model.trf_blocks[b].att.out_proj.bias = torch.nn.Parameter(state_dict[f"h.{b}.attn.c_proj.bias"])

        # Feed-forward network
        model.trf_blocks[b].ff.layers[0].weight = torch.nn.Parameter(state_dict[f"h.{b}.mlp.c_fc.weight"].T)
        model.trf_blocks[b].ff.layers[0].bias = torch.nn.Parameter(state_dict[f"h.{b}.mlp.c_fc.bias"])
        model.trf_blocks[b].ff.layers[2].weight = torch.nn.Parameter(state_dict[f"h.{b}.mlp.c_proj.weight"].T)
        model.trf_blocks[b].ff.layers[2].bias = torch.nn.Parameter(state_dict[f"h.{b}.mlp.c_proj.bias"])

        # Layer Normalization
        model.trf_blocks[b].norm1.scale = torch.nn.Parameter(state_dict[f"h.{b}.ln_1.weight"])
        model.trf_blocks[b].norm1.shift = torch.nn.Parameter(state_dict[f"h.{b}.ln_1.bias"])
        model.trf_blocks[b].norm2.scale = torch.nn.Parameter(state_dict[f"h.{b}.ln_2.weight"])
        model.trf_blocks[b].norm2.shift = torch.nn.Parameter(state_dict[f"h.{b}.ln_2.bias"])

    # Final LayerNorm and output layer (tied to the token embedding)
    model.final_norm.scale = torch.nn.Parameter(state_dict["ln_f.weight"])
    model.final_norm.shift = torch.nn.Parameter(state_dict["ln_f.bias"])
    model.out_linear.weight = torch.nn.Parameter(state_dict["wte.weight"])
Instantiate the large language model gpt2_small, load the open-source GPT-2 parameters from Hugging Face with load_huggingface_params, call the text generation function generate_text, and print the generated text:
embedding_dim = 768
num_layers = 12
num_heads = 12
context_len = 1024
vocabulary_size = 50257
dropout = 0.1
qkv_bias = True

gpt2_small = GPTModel(
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    context_len=context_len,
    vocabulary_size=vocabulary_size,
    dropout=dropout,
    qkv_bias=qkv_bias
)
load_huggingface_params(gpt2_small, hf_gpt2_small)

torch.manual_seed(123)
text = generate_text(
    model=gpt2_small, start_context="Every effort moves you", max_new_tokens=23,
    context_size=1024, tokenizer=tokenizer, temperature=0.3, top_k=50, compact_format=True
)
print(text)
Running the code above prints the following:
Every effort moves you forward, but it's a process. It's a process of learning, and it's a process of learning.
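As an optional consistency check, the GPTModel loaded via load_huggingface_params and an HFGPT2ModelWrapper built around the same Hugging Face model should produce essentially the same logits once dropout is disabled. The sketch below assumes the objects from this article are still in memory; how closely the outputs agree depends on the LayerNorm and GELU implementations from the earlier articles:

# Compare the logits of the two loading paths on the same input (sketch).
hf_wrapper = HFGPT2ModelWrapper(hf_gpt2_small)
gpt2_small.eval()   # disable dropout in the custom GPTModel
hf_wrapper.eval()   # disable dropout in the Hugging Face model

input_ids = torch.tensor([tokenizer.encode("Every effort moves you")])
with torch.no_grad():
    logits_custom = gpt2_small(input_ids)
    logits_hf = hf_wrapper(input_ids)
print((logits_custom - logits_hf).abs().max())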
5. Closing Remarks
To be continued...