import os
import json
import random
import time
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import paddle
import paddlenlp
import paddle.nn.functional as F
from functools import partial
from paddlenlp.data import Stack, Dict, Pad
from paddlenlp.datasets import load_dataset
import paddle.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from paddlenlp.transformers.auto.tokenizer import AutoTokenizer
seed = 2022
paddle.seed(seed)
random.seed(seed)
np.random.seed(seed)
# MODEL_NAME = 'ernie-3.0-base-zh'
MODEL_NAME = 'ernie-2.0-large-en'
# Maximum sequence length and batch sizes
max_seq_length = 365
train_batch_size = 16
valid_batch_size = 16
test_batch_size = 16
# Peak learning rate during training
learning_rate = 8e-5
# Number of training epochs
epochs = 50
# Learning rate warmup proportion
warmup_proportion = 0.1
# Weight decay coefficient, a regularization strategy to reduce overfitting
weight_decay = 0.01
max_grad_norm = 1.0
model_logging_dir = 'work/model_logging.csv'
early_stopping = 3
# Whether to use data augmentation
enable_dataaug = True
# Whether to enable adversarial training
enable_adversarial = False
# R-Drop loss weight; if greater than 0, the R-Drop loss is added with this coefficient
rdrop_coef = 0.1
# Directory for saving the model parameters after training
save_dir_curr = "checkpoint/{}-{}".format(MODEL_NAME.replace('/', '-'), int(time.time()))
# The TSV files are read without a header row; assign the column names shown in the format below
cols = ['id', 'title', 'assignee', 'abstract', 'label_id', 'text', 'concat_len']
train = pd.read_csv(r'/home/aistudio/data/train_split_3.tsv', delimiter='\t', header=None, names=cols)
test = pd.read_csv(r'/home/aistudio/data/dev_split_3.tsv', delimiter='\t', header=None, names=cols)
The test data has the following format:
id | title | assignee | abstract | label_id | text | concat_len |
98627711cf92f3a2e76cd7bfdf9b0827 | 星载导航接收机自主完好性监测方法与系统 | 上海清申科技发展有限公司 | 本发明提供了一种星载导航接收机自主完好性监测方法与系统,应用于星载导航接收机,包括:获取目标卫星的导航信息;基于导航信息计算目标卫星的当前历元的瞬时轨道根数;获取目标卫星的多个历史历元的瞬时轨道根数;基于多个历史历元的瞬时轨道根数,拟合目标卫星的轨道根数随时间变化的曲线,得到目标时间曲线;基于当前历元的瞬时轨道根数和目标时间曲线,进行自主完好性监测,得到监测结果。本发明缓解了现有技术中存在的在接收到低于5颗可见导航卫星的信号情况下,不能有效发现故障和解算异常的技术问题。 | 26 | 星载导航接收机自主完好性监测方法与系统,上海清申科技发展有限公司,本发明提供了一种星载导航接收机自主完好性监测方法与系统,应用于星载导航接收机,包括:获取目标卫星的导航信息;基于导航信息计算目标卫星的当前历元的瞬时轨道根数;获取目标卫星的多个历史历元的瞬时轨道根数;基于多个历史历元的瞬时轨道根数,拟合目标卫星的轨道根数随时间变化的曲线,得到目标时间曲线;基于当前历元的瞬时轨道根数和目标时间曲线,进行自主完好性监测,得到监测结果。本发明缓解了现有技术中存在的在接收到低于5颗可见导航卫星的信号情况下,不能有效发现故障和解算异常的技术问题。 | 270 |
ae53c853f21c1586ecf448a0f2533d21 | 将UHPFRC破碎回收为再生混凝土骨料的制备方法 | 深圳大学 | 本发明公开了一种将UHPFRC破碎回收为再生混凝土骨料的制备方法,包括如下步骤:预处理步骤,将UHPFRC结构破坏成相对小块,再对UHPFRC进行浸润处理;液氮降温步骤:对浸润处理后的UHPFRC通过液氮进行降温,使UHPFRC温度降到0摄氏度以下;高温加热处理步骤:将降温处理后的UHPFRC进行高温加热处理,使UHPFRC发生高温爆裂,使UHPFRC和纤维连接面发生破坏;破碎步骤:对破裂后的混凝土进行破碎处理,形成破碎的UHPFRC;筛分处理步骤:对破碎处理后的UHPFRC按照粒径大小进行筛分。本发明是一种降低能耗、提高钢纤维回收率的将UHPFRC破碎回收为再生混凝土骨料的制备方法。 | 14 | 将UHPFRC破碎回收为再生混凝土骨料的制备方法,深圳大学,本发明公开了一种将UHPFRC破碎回收为再生混凝土骨料的制备方法,包括如下步骤:预处理步骤,将UHPFRC结构破坏成相对小块,再对UHPFRC进行浸润处理;液氮降温步骤:对浸润处理后的UHPFRC通过液氮进行降温,使UHPFRC温度降到0摄氏度以下;高温加热处理步骤:将降温处理后的UHPFRC进行高温加热处理,使UHPFRC发生高温爆裂,使UHPFRC和纤维连接面发生破坏;破碎步骤:对破裂后的混凝土进行破碎处理,形成破碎的UHPFRC;筛分处理步骤:对破碎处理后的UHPFRC按照粒径大小进行筛分。本发明是一种降低能耗、提高钢纤维回收率的将UHPFRC破碎回收为再生混凝土骨料的制备方法。 | 326 |
516ce3d3c650d2c0deda3d7ab68f49c7 | 一种后浇带基面处理剂及其施工方法 | 黄水良 | 本发明公开了一种后浇带基面处理剂,所述后浇带基面处理剂包括以下重量份数的原料:硅酸盐水泥:600~650份;石英砂:280~330份;活性母料:30~100份;可再分散乳胶粉:6~30份;羟丙基甲基纤维素:1~15份;减水剂:2~20份。本发明还公开了上述后浇带基面处理剂的施工方法。本发明的后浇带基面处理剂可以形成致密的抗渗区域水泥结晶层,致密坚硬,强度高,能防御一定的弹性变形,形成能自我修复并且可靠的永久性防水防渗层。本发明的施工方法简单,容易操作,能一次成活,减少渗漏带来的危害,提高混凝土结构的安全度,保证结构完整性,延长工程使用寿命,节约工程维修成本,并且节省两侧的钢板止水带,节约能源。 | 28 | 一种后浇带基面处理剂及其施工方法,黄水良,本发明公开了一种后浇带基面处理剂,所述后浇带基面处理剂包括以下重量份数的原料:硅酸盐水泥:600~650份;石英砂:280~330份;活性母料:30~100份;可再分散乳胶粉:6~30份;羟丙基甲基纤维素:1~15份;减水剂:2~20份。本发明还公开了上述后浇带基面处理剂的施工方法。本发明的后浇带基面处理剂可以形成致密的抗渗区域水泥结晶层,致密坚硬,强度高,能防御一定的弹性变形,形成能自我修复并且可靠的永久性防水防渗层。本发明的施工方法简单,容易操作,能一次成活,减少渗漏带来的危害,提高混凝土结构的安全度,保证结构完整性,延长工程使用寿命,节约工程维修成本,并且节省两侧的钢板止水带,节约能源。 | 321 |
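Before splitting, it is worth a quick sanity check of the class distribution and text lengths (an illustrative check, assuming the column names shown above):
print(train.shape, test.shape)
print(train['label_id'].value_counts().head(10))   # examples per class, most frequent first
print(train['text'].str.len().describe())          # character-length statistics of the concatenated text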
train_data, valid_data = train_test_split(train, test_size=0.3667, random_state=5)
print("train size: {} \nvalid size: {}".format(len(train_data), len(valid_data)))
print("train labels: ", sorted(train_data["label_id"].unique()))
print("valid labels: ", sorted(valid_data["label_id"].unique()))
from paddlenlp.dataaug import WordSubstitute
if enable_dataaug:
    # Dump all text to a plain-text file so TF-IDF statistics can be computed
    with open("work/data.txt", "w", encoding="utf-8") as f:
        for i in train['text']:
            f.write(i + '\n')
        for i in test['text']:
            f.write(i + '\n')
    random.seed(seed)
    np.random.seed(seed)
    tf_idf_file = "work/data.txt"
    aug = WordSubstitute('synonym',
                         tf_idf=True,             # use TF-IDF weighting when choosing words to replace
                         tf_idf_file=tf_idf_file,
                         create_n=30,             # number of augmented sentences generated per input
                         aug_percent=0.15         # fraction of words in each sentence to substitute
                         )
    # Generate augmented examples for a given label
    def data_aug_sample(label_id, data, aug):
        aug_sample = []
        sample = data[data['label_id'] == label_id]
        for pre_aug_sample in aug.augment(sample['text'].tolist()):
            aug_sample.extend(pre_aug_sample)
        return pd.DataFrame({"text": aug_sample, "label_id": [label_id] * len(aug_sample)})
    # Target number of examples per label
    upper_limit = 180
    # Generate and sample augmented data based on the label counts
    label_id_indexs = train["label_id"].value_counts().index
    label_id_nums = train["label_id"].value_counts().values
    for label_id, value in zip(label_id_indexs, label_id_nums):
        if value < upper_limit:
            # Number of extra examples needed to reach the target
            sample_nums = upper_limit - value
            # Generate augmented data for this label
            label_aug_data = data_aug_sample(data=train_data, label_id=label_id, aug=aug)
            # If fewer augmented examples were produced than needed, take all of them
            if len(label_aug_data) < sample_nums:
                sample_nums = len(label_aug_data)
            # Sample the augmented data
            label_aug_data = label_aug_data.sample(n=sample_nums, random_state=0)
            # Append to the training set
            train_data = pd.concat((train_data, label_aug_data), axis=0)
    # Reset the index
    train_data = train_data.reset_index(drop=True)
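If augmentation is enabled, re-counting the classes shows whether the long-tail labels were topped up (an illustrative check, not part of the original pipeline):
print(train_data['label_id'].value_counts().tail(10))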
print("train size: {} \nvalid size {}".format(len(train_data),len(valid_data)))
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Data reader: yields one example per row as a dict
def read(df, istrain=True):
    if istrain:
        for idx, data in df.iterrows():
            yield {
                "words": data['text'],
                "labels": data['label_id']
            }
    else:
        for idx, data in df.iterrows():
            yield {
                "words": data['text'],
            }
# Pass the generator to load_dataset
train_ds = load_dataset(read, df=train_data, lazy=False)
valid_ds = load_dataset(read, df=valid_data, lazy=False)
# Inspect a couple of examples
for idx in range(1,3):
print(train_ds[idx])
print("==="*30)
# Feature encoding
def convert_example(example, tokenizer, max_seq_len=512, mode='train'):
    # Use the tokenizer to convert the text into token ids
    tokenized_input = tokenizer(example['words'], is_split_into_words=True, max_seq_len=max_seq_len)
    if mode == "test":
        return tokenized_input
    # Attach the label as a one-element list
    tokenized_input['labels'] = [example['labels']]
    return tokenized_input  # dict containing input_ids, token_type_ids and labels
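To see what the conversion produces, it can be run on a single raw example before mapping the whole dataset (illustrative; the exact keys come from the tokenizer call above):
sample_encoded = convert_example(train_ds[0], tokenizer, max_seq_len=max_seq_length, mode='train')
print(list(sample_encoded.keys()))       # expected: input_ids, token_type_ids, labels
print(sample_encoded['input_ids'][:10])  # first few token ids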
train_trans_func = partial(
convert_example,
tokenizer=tokenizer,
mode='train',
max_seq_len=max_seq_length)
valid_trans_func = partial(
convert_example,
tokenizer=tokenizer,
mode='dev',
max_seq_len=max_seq_length)
# Apply the feature encoding
train_ds.map(train_trans_func, lazy=False)
valid_ds.map(valid_trans_func, lazy=False)
# Initialize the BatchSamplers
train_batch_sampler = paddle.io.BatchSampler(train_ds, batch_size=train_batch_size, shuffle=True)
valid_batch_sampler = paddle.io.BatchSampler(valid_ds, batch_size=valid_batch_size, shuffle=False)
# print("check that the shuffled batch order is fixed by the seed")
# print([*train_batch_sampler][0])
# print([585, 407, 408, 535, 631, 93, 534, 422, 570, 648, 221, 518, 434, 788, 536, 113])
# Define batchify_fn: pad the token ids and stack the labels into a batch
batchify_fn = lambda samples, fn = Dict({
    "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
    "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    "labels": Stack(dtype="int64"),  # hard labels need an integer dtype for softmax_with_cross_entropy
}): fn(samples)
# Initialize the DataLoaders
train_data_loader = paddle.io.DataLoader(
dataset=train_ds,
batch_sampler=train_batch_sampler,
collate_fn=batchify_fn,
return_list=True)
valid_data_loader = paddle.io.DataLoader(
dataset=valid_ds,
batch_sampler=valid_batch_sampler,
collate_fn=batchify_fn,
return_list=True)
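A quick look at the first batch confirms that padding and stacking produce the expected shapes (a sanity check, not part of the original flow):
for input_ids, token_type_ids, labels in train_data_loader:
    print(input_ids.shape, token_type_ids.shape, labels.shape)  # roughly [batch, seq_len], [batch, seq_len], [batch, 1]
    break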
# Build the test set the same way
test_ds = load_dataset(read,df=test, istrain=False, lazy=False)
test_trans_func = partial(
convert_example,
tokenizer=tokenizer,
mode='test',
max_seq_len=max_seq_length)
test_ds.map(test_trans_func, lazy=False)
test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=test_batch_size, shuffle=False)
test_batchify_fn = lambda samples, fn = Dict({
"input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
"token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
}): fn(samples)
test_data_loader = paddle.io.DataLoader(
dataset=test_ds,
batch_sampler=test_batch_sampler,
collate_fn=test_batchify_fn,
return_list=True)
from paddlenlp.transformers.ernie.modeling import ErniePretrainedModel,ErnieForSequenceClassification
class CCFFSLModel(ErniePretrainedModel):
def __init__(self, ernie, num_classes=2, dropout=None):
super(CCFFSLModel,self).__init__()
self.ernie = ernie # allow ernie to be config
self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.config["hidden_dropout_prob"])
self.classifier = nn.Linear(self.ernie.config["hidden_size"],num_classes)
self.apply(self.init_weights)
def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
_, pooled_output = self.ernie(input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return logits
label_classes = train['label_id'].unique()
model = CCFFSLModel.from_pretrained(MODEL_NAME, num_classes=len(label_classes))
# Total number of training steps
num_training_steps = len(train_data_loader) * epochs
# Learning rate schedule: linear decay with warmup
lr_scheduler = paddlenlp.transformers.LinearDecayWithWarmup(learning_rate, num_training_steps,warmup_proportion)
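With warmup_proportion = 0.1, the learning rate ramps up over the first ~10% of steps and then decays linearly; the step counts can be checked directly (illustrative):
warmup_steps = int(num_training_steps * warmup_proportion)  # steps spent warming up
print("total steps: {}, warmup steps: {}".format(num_training_steps, warmup_steps))
# Exclude bias and LayerNorm parameters from weight decay: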
decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
# Define the optimizer (AdamW with weight decay and gradient clipping)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=paddle.nn.ClipGradByGlobalNorm(max_grad_norm))
# utils - adversarial training with FGM
class FGM(object):
    """
    Fast Gradient Method (FGM)
    Adversarial training that perturbs the embedding layer along the gradient direction.
    """
    def __init__(self, model, epsilon=1., emb_name='emb'):
        # emb_name should match the name of the embedding parameters in your model
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name
        self.backup = {}
    def attack(self):
        for name, param in self.model.named_parameters():
            if not param.stop_gradient and self.emb_name in name:  # only trainable embedding parameters
                self.backup[name] = param.numpy()  # back up the original values
                grad_tensor = paddle.to_tensor(param.grad)  # param.grad may be a numpy array in some Paddle versions
                norm = paddle.norm(grad_tensor)  # gradient norm
                if norm != 0:
                    r_at = self.epsilon * grad_tensor / norm
                    param.set_value(param + r_at)  # add the gradient-direction perturbation to the embedding values
    def restore(self):
        for name, param in self.model.named_parameters():
            if not param.stop_gradient and self.emb_name in name:
                assert name in self.backup
                param.set_value(self.backup[name])  # restore the original embedding values
        self.backup = {}
# Adversarial training setup
if enable_adversarial:
    adv = FGM(model=model, epsilon=1e-6, emb_name='word_embeddings')
# Evaluation
@paddle.no_grad()
def evaluation(model, data_loader):
model.eval()
real_s = []
pred_s = []
for batch in data_loader:
input_ids, token_type_ids, labels = batch
logits = model(input_ids, token_type_ids)
probs = F.softmax(logits,axis=1)
pred_s.extend(probs.argmax(axis=1).numpy())
real_s.extend(labels.reshape([-1]).numpy())
score = f1_score(y_pred=pred_s,y_true=real_s,average="macro")
return score
# Training loop
def do_train(model,data_loader):
print("train ...")
total_loss = 0.
model_total_epochs = 0
best_score = 0.
num_early_stopping = 0
if rdrop_coef > 0:
rdrop_loss = paddlenlp.losses.RDropLoss()
    # Main training loop
train_time = time.time()
valid_time = time.time()
model.train()
for epoch in range(0, epochs):
preds,reals = [],[]
for step, batch in enumerate(data_loader, start=1):
input_ids, token_type_ids, labels = batch
logits = model(input_ids, token_type_ids)
            # Use R-Drop: run a second forward pass and add a KL term between the two outputs
            if rdrop_coef > 0:
                logits_2 = model(input_ids=input_ids, token_type_ids=token_type_ids)
                ce_loss = (F.softmax_with_cross_entropy(logits, labels).mean() + F.softmax_with_cross_entropy(logits_2, labels).mean()) * 0.5
                kl_loss = rdrop_loss(logits, logits_2)
                loss = ce_loss + kl_loss * rdrop_coef
else:
loss = F.softmax_with_cross_entropy(logits,labels).mean()
loss.backward()
            # Adversarial training
            if enable_adversarial:
                adv.attack()  # add an adversarial perturbation to the embeddings
                adv_logits = model(input_ids, token_type_ids)
                adv_loss = F.softmax_with_cross_entropy(adv_logits, labels).mean()
                adv_loss.backward()  # backprop, accumulating the adversarial gradients on top of the normal ones
                adv.restore()  # restore the embedding parameters
total_loss += loss.numpy()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
model_total_epochs += 1
# probs = F.softmax(logits,axis=1)
# preds.extend(probs.argmax(axis=1))
# reals.extend(labels.reshape([-1]))
# train_f1 = f1_score(y_pred=preds,y_true=reals,average="macro")
# print("train f1: %.5f training loss: %.5f speed %.1f s" % (train_f1, total_loss/model_total_epochs,(time.time() - train_time)))
# train_time = time.time()
eval_score = evaluation(model, valid_data_loader)
print("【%.2f%%】validation speed %.2f s" % (model_total_epochs/num_training_steps*100,time.time() - valid_time))
valid_time = time.time()
if best_score < eval_score:
num_early_stopping = 0
print("eval f1: %.5f f1 update %.5f ---> %.5f " % (eval_score,best_score,eval_score))
best_score = eval_score
            # Only save the model when the score is above 0.45
            if best_score > 0.45:
                # Save the model parameters
                os.makedirs(save_dir_curr, exist_ok=True)
                save_param_path = os.path.join(save_dir_curr, 'model_best.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                # Save the tokenizer
                tokenizer.save_pretrained(save_dir_curr)
else:
num_early_stopping = num_early_stopping + 1
print("eval f1: %.5f but best f1 %.5f early_stoping_num %d" % (eval_score,best_score,num_early_stopping))
model.train()
if num_early_stopping >= early_stopping:
break
return best_score
best_score = do_train(model,train_data_loader)
# logging part
logging_dir = 'work/submit'
os.makedirs(logging_dir, exist_ok=True)
logging_name = os.path.join(logging_dir, 'run_logging.csv')
var = [MODEL_NAME, seed, learning_rate, max_seq_length, enable_dataaug, enable_adversarial, rdrop_coef, best_score, save_dir_curr]
names = ['model', 'seed', 'lr', 'max_len', 'enable_dataaug', 'enable_adversarial', 'rdrop_coef', 'best_score', 'save_model_name']
vars_dict = {k: v for k, v in zip(names, var)}
results = dict(**vars_dict)
keys = list(results.keys())
values = list(results.values())
if not os.path.exists(logging_name):
ori = []
ori.append(values)
logging_df = pd.DataFrame(ori, columns=keys)
logging_df.to_csv(logging_name, index=False)
else:
    logging_df = pd.read_csv(logging_name)
    new = pd.DataFrame(results, index=[1])
    logging_df = pd.concat([logging_df, new], ignore_index=True)
    logging_df.to_csv(logging_name, index=False)
# Prediction
@paddle.no_grad()
def do_sample_predict(model, data_loader, is_prob=False):
    model.eval()
    preds = []
    probs_score = []
    for batch in data_loader:
        input_ids, token_type_ids = batch
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=1)
        preds.extend(probs.argmax(axis=1).numpy())
        probs_score.extend(probs.numpy())
    probs_score = np.array(probs_score)
    # Save the per-class probabilities for later analysis
    df_probs_score = pd.DataFrame(probs_score)
    df_probs_score.to_csv('./data/predict_probs_score.tsv', sep='\t', index=False)
    if is_prob:
        return probs_score
    return preds
# Load the best checkpoint
state_dict = paddle.load(os.path.join(save_dir_curr,'model_best.pdparams'))
model.load_dict(state_dict)
# Predict
print("predict start ...")
pred_score = do_sample_predict(model,test_data_loader)
print("predict end ...")