- 论文: https://arxiv.org/abs/2105.04714
- 源码: https://github.com/deepinsight/insightface/tree/master/detection/scrfd
环境搭建 🚀
pip install mmcv-full --user
pip install -r requirements/build.txt --user
pip install -v -e . --user
测试 ⛵️
python demo/image_demo.py 10.jpg configs/scrfd/scrfd_500m.py model.pth
训练人脸数据集 🤢
- 我使用 darknet 标注工具进行标注,得到的是 darknet 格式的数据集,因此先用下面的 Python 脚本把它转换成 SCRFD 训练所需的格式。
# 写这段代码的时候,只有上帝和我知道它是干嘛的
# 现在,只有上帝知道
# @File : make_scrfd_dataset.py
# @Time : 2022/10/17 16:17
# @Author : J.
# @desc : darknet 转换 scrfd 数据集
import os
import cv2
import shutil
from pathlib import Path
import uuid
import random
def get_uuid():
    """Return a random UUID4 as a 32-character hex string without dashes.

    Used to give every exported image a unique file name.
    """
    # UUID.hex is the dash-free form; no need to split and re-join the string.
    return uuid.uuid4().hex
def get_bbox(txt, img_w, img_h):
    """Read a darknet label file and return the class-0 boxes in pixels.

    Every usable line has five whitespace-separated fields:
    ``class center_x center_y width height``. The four coordinates are
    multiplied by the image size, so they are expected to be relative values.

    Args:
        txt: Path of the darknet ``.txt`` label file.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        A list of ``[x1, y1, x2, y2]`` corner boxes (floats), one per line
        whose class id is 0. Lines that do not have exactly five fields are
        ignored.
    """
    bbox = []
    with open(txt, 'r', encoding='utf-8') as f:
        for ann in f:
            # split() without an argument tolerates tabs, repeated spaces and
            # trailing whitespace, which split(" ") turned into extra fields
            # and therefore silently dropped the annotation.
            fields = ann.split()
            if len(fields) != 5:
                continue
            # Only class 0 is exported.
            if int(fields[0]) != 0:
                continue
            c_x = float(fields[1]) * img_w
            c_y = float(fields[2]) * img_h
            half_w = float(fields[3]) * img_w / 2.0
            half_h = float(fields[4]) * img_h / 2.0
            bbox.append([c_x - half_w, c_y - half_h, c_x + half_w, c_y + half_h])
    return bbox
def get_data_lst(root):
    """Collect the ``.jpg`` images in ``root`` that have a matching label file.

    An image ``<stem>.jpg`` is kept only when ``<stem>.txt`` exists in the
    same directory. The directory is not searched recursively.

    Args:
        root: Directory holding the images and their darknet ``.txt`` labels.

    Returns:
        A list of dicts with the keys ``"file"`` (image path), ``"label"``
        (label path) and ``"name"`` (image file name), in ``os.listdir`` order.
    """
    lists = []
    for f in os.listdir(root):
        if not f.endswith(".jpg"):
            continue
        # splitext strips only the final extension, so names containing
        # extra dots (e.g. "a.b.jpg") still map to "a.b.txt" instead of
        # being truncated at the first dot.
        stem = os.path.splitext(f)[0]
        txt_path = os.path.join(root, stem + ".txt")
        if not os.path.exists(txt_path):
            continue
        lists.append({"file": os.path.join(root, f), "label": txt_path, "name": f})
    return lists
def save_data_lst(data_lst, save_dir):
    """Export samples as ``<save_dir>/images/*.jpg`` plus ``labelv2.txt``.

    For every sample the label file receives a header line
    ``# <image_name> <width> <height>`` followed by one ``x1 y1 x2 y2`` line
    per box (rounded to 5 decimals). The image is copied under a fresh
    UUID-based name.

    Args:
        data_lst: Items as produced by ``get_data_lst`` (dicts with the keys
            ``"file"`` and ``"label"``).
        save_dir: Output directory; it and its ``images`` sub-directory are
            created when missing.
    """
    imgs_save_path = os.path.join(save_dir, "images")
    Path(imgs_save_path).mkdir(parents=True, exist_ok=True)
    total = len(data_lst)
    # The context manager closes the label file even if a sample raises.
    with open(os.path.join(save_dir, 'labelv2.txt'), 'w', encoding='utf-8') as txt_file:
        for i, item in enumerate(data_lst):
            print("{} / {}".format(i, total))
            img = cv2.imread(item["file"])
            if img is None:
                # imread signals an unreadable or corrupt file by returning
                # None; skip it instead of crashing on img.shape.
                print("skip unreadable image: {}".format(item["file"]))
                continue
            img_h, img_w = img.shape[:2]
            image_name = get_uuid() + ".jpg"
            txt_file.write("# {} {} {}\n".format(image_name, img_w, img_h))
            for x1, y1, x2, y2 in get_bbox(item["label"], img_w, img_h):
                txt_file.write("{} {} {} {}\n".format(
                    round(x1, 5), round(y1, 5), round(x2, 5), round(y2, 5)))
            shutil.copy(item["file"], os.path.join(imgs_save_path, image_name))
def do(root_dir, out_dir, rate=0.9):
    """Convert a darknet dataset into SCRFD ``train`` and ``val`` splits.

    The samples found in ``root_dir`` are shuffled and divided into two
    disjoint parts, written to ``<out_dir>/train`` and ``<out_dir>/val``.

    Args:
        root_dir: Directory with the ``.jpg`` images and darknet ``.txt`` labels.
        out_dir: Directory that receives the ``train`` and ``val`` folders.
        rate: Fraction of the samples that goes into the training split; the
            remainder becomes the validation split.
    """
    data_lst = get_data_lst(root_dir)
    # Shuffle first so that the split does not depend on directory order.
    random.shuffle(data_lst)
    # Honour `rate` (previously ignored) and keep the count inside the list.
    train_data_count = max(0, min(len(data_lst), int(len(data_lst) * rate)))
    train_data_dir_path = os.path.join(out_dir, "train")
    val_data_dir_path = os.path.join(out_dir, "val")
    save_data_lst(data_lst[:train_data_count], train_data_dir_path)
    save_data_lst(data_lst[train_data_count:], val_data_dir_path)
def rename(root):
    """Shorten ``.jpg`` names in ``root`` to the text before their first dot.

    ``a.b.jpg`` becomes ``a.jpg``; names that already have this form are left
    untouched. A file is skipped (with a printed notice) when the shortened
    name already exists, so no image is overwritten.

    Args:
        root: Directory whose ``.jpg`` files are renamed in place.
    """
    for f in os.listdir(root):
        if not f.endswith(".jpg"):
            continue
        dst_name = f.split('.')[0] + ".jpg"
        if dst_name == f:
            # Nothing to do: the name contains no extra dots.
            continue
        dst_file_path = os.path.join(root, dst_name)
        if os.path.exists(dst_file_path):
            # os.rename would silently replace the target on POSIX systems.
            print("skip {}: {} already exists".format(f, dst_name))
            continue
        os.rename(os.path.join(root, f), dst_file_path)
if __name__ == '__main__':
    # Placeholder locations: the darknet image/label folder to read and the
    # folder that receives the converted dataset.
    root_dir = r"xxxxxx\img"
    out_dir = r"xxxxxx\out"
    do(root_dir=root_dir, out_dir=out_dir)
- 大概结构如图:
- 因为我只想训练人脸检测,标注中没有生成关键点(特征点),所以需要相应修改 mmdet/datasets/retinaface.py 中的 _parse_ann_line:
def _parse_ann_line
...
if len(values) >= 4:
if len(values) > 5:
# print(values)
kps = np.array(values[4:19], dtype=np.float32).reshape((self.NK, 3))
for li in range(kps.shape[0]):
if (kps[li, :] == -1).all():
# assert kps[li][2]==-1
kps[li][2] = 0.0 # weight = 0, ignore
else:
assert kps[li][2] >= 0
kps[li][2] = 1.0 # weight
# if li==0:
# landmark_num+=1
# if kps[li][2]==0.0:#visible
# kps[li][2] = 1.0
# else:
# kps[li][2] = 0.0
else: # len(values)==5
ignore = False
# if not ignore:
# ignore = (values[4] == 1)
...
- 预训练模型: 这里选择训练 SCRFD_500M
- 作者默认训练 640 ,我这里修改成 416 (configs/scrfd/scrfd_500m.py)
# Optimizer: SGD, with gradient clipping switched off.
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict(grad_clip=None)

# Schedule: the epoch numbers below are base values scaled by lr_mult.
lr_mult = 8
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=0.001,
    step=[lr_mult * 55, lr_mult * 68])
total_epochs = lr_mult * 80

# Runtime: checkpointing, logging, distributed backend, resume options.
checkpoint_config = dict(interval=20)
log_config = dict(interval=10, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

# Dataset type, locations and the shared normalisation constants.
dataset_type = 'RetinaFaceDataset'
data_root = '../data/retinaface/'
train_root = '../data/retinaface/train/'
val_root = '../data/retinaface/val/'
img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5], std=[128.0, 128.0, 128.0], to_rgb=True)

# Training pipeline: random square crop, resize to 416x416, flip,
# photometric distortion, normalisation, then bundling/collection.
train_pipeline = [
    dict(type='LoadImageFromFile', to_float32=True),
    dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True),
    dict(
        type='RandomSquareCrop',
        crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]),
    dict(type='Resize', img_scale=(416, 416), keep_ratio=False),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='PhotoMetricDistortion',
        brightness_delta=32,
        contrast_range=(0.5, 1.5),
        saturation_range=(0.5, 1.5),
        hue_delta=18),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore',
            'gt_keypointss'
        ]),
]

# Test/validation pipeline: aspect-preserving resize, no flip, pad to 416x416.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(416, 416),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip', flip_ratio=0.0),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size=(416, 416), pad_val=0),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ]),
]

# Data loading: val and test both read the validation split and reuse the
# test pipeline; train uses the training pipeline defined above.
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=3,
    train=dict(
        type=dataset_type,
        ann_file='../data/retinaface/train/labelv2.txt',
        img_prefix='../data/retinaface/train/images/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file='../data/retinaface/val/labelv2.txt',
        img_prefix='../data/retinaface/val/images/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file='../data/retinaface/val/labelv2.txt',
        img_prefix='../data/retinaface/val/images/',
        pipeline=test_pipeline))

# Training / testing settings; the same values also appear inside
# model['bbox_head'] below.
train_cfg = dict(
    assigner=dict(type='ATSSAssigner', topk=9),
    allowed_border=-1,
    pos_weight=-1,
    debug=False)
test_cfg = dict(
    nms_pre=-1,
    min_bbox_size=0,
    score_thr=0.02,
    nms=dict(type='nms', iou_threshold=0.45),
    max_per_img=-1)

# Detector: MobileNetV1 backbone, PAFPN neck, SCRFD head without keypoints.
model = dict(
    type='SCRFD',
    backbone=dict(
        type='MobileNetV1',
        block_cfg=dict(
            stage_blocks=(2, 3, 2, 6),
            stage_planes=[16, 16, 40, 72, 152, 288])),
    neck=dict(
        type='PAFPN',
        in_channels=[40, 72, 152, 288],
        out_channels=16,
        start_level=1,
        add_extra_convs='on_output',
        num_outs=3),
    bbox_head=dict(
        type='SCRFDHead',
        num_classes=1,
        in_channels=16,
        stacked_convs=2,
        feat_channels=64,
        # GroupNorm is active; a BN variant was left disabled in the original:
        # dict(type='BN', requires_grad=True)
        norm_cfg=dict(type='GN', num_groups=16, requires_grad=True),
        cls_reg_share=True,
        strides_share=True,
        dw_conv=True,
        scale_mode=2,
        anchor_generator=dict(
            type='AnchorGenerator',
            ratios=[1.0],
            scales=[1, 2],
            base_sizes=[16, 64, 256],
            strides=[8, 16, 32]),
        loss_cls=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=1.0),
        loss_dfl=False,
        reg_max=8,
        loss_bbox=dict(type='DIoULoss', loss_weight=2.0),
        # Keypoint branch is off, matching a dataset exported without landmarks.
        use_kps=False,
        loss_kps=dict(
            type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=0.1),
        train_cfg=train_cfg,
        test_cfg=test_cfg))

epoch_multi = 1
evaluation = dict(interval=20, metric='mAP')
- 训练
python ./tools/train.py ./configs/scrfd/scrfd_500m.py --gpus=1
遇到的问题 ❓
- Only tuples, lists and Variables are supported as JIT inputs/outputs. Dictionaries and strings are also accepted, but their usage is not recommended. Here, received an input of unsupported type: numpy.ndarray
# 解决办法: 把下面这个文件中所有 `args = _decide_input_format(model, args)` 的调用都注释掉即可。参考: https://github.com/deepinsight/insightface/issues/2054
lib/python3.7/site-packages/torch/onnx/utils.py
# args = _decide_input_format(model, args)
- AttributeError: module 'pycocotools' has no attribute '__version__'
pip uninstall pycocotools
pip uninstall mmpycocotools
pip install mmpycocotools (如果还报错,就换个版本安装试试)
- ModuleNotFoundError: No module named 'mmcv._ext'
# 获取版本 , 根据版本安装
# 参考: https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html
python -c 'import torch;print(torch.__version__);print(torch.version.cuda)'
pip install mmcv-full==1.6.2 -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12/index.html
- ModuleNotFoundError: No module named 'mmdet'
pip install -v -e . --user
NCNN 💞
- ncnn 推理 可以参考 nihui 大佬的教程, 稳的一匹。
- https://zhuanlan.zhihu.com/p/372332267
END 🔚
记录下训练全过程,以便日后再次使用, 有需要的小伙伴可以参考参考,告辞。💥
本文含有隐藏内容,请 开通VIP 后查看