基于SCRFD，训练人脸数据集-EW帮帮网

在这里插入图片描述

论文： https://arxiv.org/abs/2105.04714
源码： https://github.com/deepinsight/insightface/tree/master/detection/scrfd

环境搭建 🚀

pip install mmcv-full --user
pip install -r requirements/build.txt --user
pip install -v -e . --user

测试 ⛵️

python demo/image_demo.py 10.jpg configs/scrfd/scrfd_500m.py model.pth

训练人脸数据集 🤢

这里我使用 darknet 标注工具进行标注，生成的是 darknet 格式的数据集，下面用 Python 脚本把它转换成 SCRFD 训练所需的格式。

# 写这段代码的时候，只有上帝和我知道它是干嘛的
# 现在，只有上帝知道
# @File : make_scrfd_dataset.py
# @Time : 2022/10/17 16:17 
# @Author : J.
# @desc :  darknet 转换 scrfd 数据集


import os
import cv2
import shutil
from pathlib import Path
import uuid
import random


def get_uuid():
    """Return a random UUID4 as a 32-character lowercase hex string.

    The value is the canonical UUID text with the dashes removed.
    """
    return uuid.uuid4().hex


def get_bbox(txt, img_w, img_h):
    """Read a darknet label file and return the class-0 boxes as pixel corners.

    Every annotation line is expected to carry five whitespace-separated
    fields, ``class cx cy w h``, where the four box values are fractions of
    the image size. Lines with a different number of fields, and lines whose
    class id is not 0, are skipped.

    Args:
        txt: Path of the darknet ``.txt`` label file (read as UTF-8).
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        A list of ``[x1, y1, x2, y2]`` lists (floats, in pixels), in file
        order.

    Raises:
        ValueError: If a five-field line holds a value that cannot be parsed
            as a number.
    """
    bbox = []
    with open(txt, 'r', encoding='utf-8') as f:
        for ann in f:
            # split() without an argument tolerates tabs, repeated spaces and
            # trailing whitespace; split(" ") turned those into extra fields
            # and the box was silently dropped by the length check.
            fields = ann.split()
            if len(fields) != 5:
                continue
            if int(fields[0]) != 0:
                continue
            c_x = float(fields[1]) * img_w
            c_y = float(fields[2]) * img_h
            b_w = float(fields[3]) * img_w
            b_h = float(fields[4]) * img_h
            # Convert centre/size to top-left and bottom-right corners.
            bbox.append([c_x - b_w / 2.0, c_y - b_h / 2.0, c_x + b_w / 2.0, c_y + b_h / 2.0])
    return bbox


def get_data_lst(root):
    """Collect the ``.jpg`` images in *root* that have a matching label file.

    Only the directory itself is scanned (no recursion). An image is kept
    when a ``.txt`` file with the same base name exists next to it.

    Args:
        root: Directory holding the images and their darknet label files.

    Returns:
        A list of dicts with the keys ``"file"`` (image path), ``"label"``
        (label path) and ``"name"`` (image file name), ordered by file name.
    """
    lists = []
    # Sorted so that the result does not depend on the directory listing order.
    for f in sorted(os.listdir(root)):
        if not f.endswith(".jpg"):
            continue
        # splitext removes only the final extension, so "a.b.jpg" pairs with
        # "a.b.txt"; split('.')[0] looked for "a.txt" and skipped the image.
        stem = os.path.splitext(f)[0]
        txt_path = os.path.join(root, stem + ".txt")
        if not os.path.exists(txt_path):
            continue
        lists.append({"file": os.path.join(root, f), "label": txt_path, "name": f})
    return lists


def save_data_lst(data_lst, save_dir):
    """Write one dataset split: copied images plus a ``labelv2.txt`` index.

    For every entry the image is copied to ``<save_dir>/images`` under a fresh
    random name, and ``labelv2.txt`` receives a header line
    ``# <image_name> <width> <height>`` followed by one ``x1 y1 x2 y2`` line
    per box (values rounded to 5 decimals).

    Args:
        data_lst: Entries as produced by ``get_data_lst`` (dicts with the
            keys ``"file"`` and ``"label"``).
        save_dir: Output directory of the split; created if missing.
    """
    imgs_save_path = os.path.join(save_dir, "images")
    Path(imgs_save_path).mkdir(parents=True, exist_ok=True)
    total = len(data_lst)
    # The with-block closes the label file even if an entry fails midway.
    with open(os.path.join(save_dir, 'labelv2.txt'), 'w', encoding='utf-8') as txt_file:
        for idx, item in enumerate(data_lst, start=1):
            print("{} / {}".format(idx, total))
            img = cv2.imread(item["file"])
            if img is None:
                # cv2.imread signals an unreadable/corrupt file by returning
                # None; skip it instead of crashing on img.shape.
                print("skip unreadable image: {}".format(item["file"]))
                continue
            img_h, img_w = img.shape[:2]
            image_name = get_uuid() + ".jpg"
            txt_file.write("# {} {} {}\n".format(image_name, img_w, img_h))
            for box in get_bbox(item["label"], img_w, img_h):
                txt_file.write(
                    "{} {} {} {}\n".format(round(box[0], 5), round(box[1], 5), round(box[2], 5),
                                           round(box[3], 5)))
            shutil.copy(item["file"], os.path.join(imgs_save_path, image_name))


def do(root_dir, out_dir, rate=0.9):
    """Convert a darknet-labelled folder into SCRFD train/val splits.

    The labelled images found in *root_dir* are shuffled and divided into two
    disjoint parts, written to ``<out_dir>/train`` and ``<out_dir>/val``.

    Args:
        root_dir: Folder with the ``.jpg`` images and darknet ``.txt`` labels.
        out_dir: Folder that receives the ``train`` and ``val`` sub-folders.
        rate: Fraction of the samples that goes into the training split; the
            remainder becomes the validation split.
    """
    # Shuffle a copy so the list returned by get_data_lst is left untouched.
    samples = list(get_data_lst(root_dir))
    random.shuffle(samples)
    train_data_count = int(len(samples) * rate)

    train_data_dir_path = os.path.join(out_dir, "train")
    val_data_dir_path = os.path.join(out_dir, "val")

    # Previously `rate` was ignored: only a random ~20% validation set was
    # written and the training split had to be produced by hand.
    save_data_lst(samples[:train_data_count], train_data_dir_path)
    save_data_lst(samples[train_data_count:], val_data_dir_path)


def rename(root):
    """Rename each ``.jpg`` in *root* to ``<text before the first dot>.jpg``.

    Files whose name already has that form are renamed onto themselves;
    entries that do not end in ``.jpg`` are left alone.

    Args:
        root: Directory whose direct entries are processed.
    """
    for entry in os.listdir(root):
        src = os.path.join(root, entry)
        if src.endswith(".jpg"):
            base = os.path.basename(src).split('.')[0]
            os.rename(src, os.path.join(root, base + ".jpg"))


if __name__ == '__main__':
    # Placeholder locations: the darknet-labelled image folder and the
    # folder that receives the converted dataset.
    root_dir, out_dir = r"xxxxxx\img", r"xxxxxx\out"
    do(root_dir, out_dir)

大概结构如图:
因为我只想训练人脸检测框，所以数据集中没有生成关键点（特征点）标注；为此需要修改 mmdet/datasets/retinaface.py 中解析标注行的逻辑：

	def _parse_ann_line
	...
		if len(values) >= 4:
            if len(values) > 5:
                # print(values)
                kps = np.array(values[4:19], dtype=np.float32).reshape((self.NK, 3))
                for li in range(kps.shape[0]):
                    if (kps[li, :] == -1).all():
                        # assert kps[li][2]==-1
                        kps[li][2] = 0.0  # weight = 0, ignore
                    else:
                        assert kps[li][2] >= 0
                        kps[li][2] = 1.0  # weight
                        # if li==0:
                        #  landmark_num+=1
                        # if kps[li][2]==0.0:#visible
                        #  kps[li][2] = 1.0
                        # else:
                        #  kps[li][2] = 0.0
            else:  # len(values)==5
                ignore = False
                # if not ignore:
                #     ignore = (values[4] == 1)
   ...

模型选择：这里训练的是 SCRFD_500M。
作者默认的训练输入尺寸是 640，我这里修改成 416（配置文件：configs/scrfd/scrfd_500m.py）。

# ---- Optimiser and learning-rate schedule ----
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict(grad_clip=None)
# lr_mult stretches the step milestones and the epoch budget by the same factor.
lr_mult = 8
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=0.001,
    step=[55 * lr_mult, 68 * lr_mult],
)
total_epochs = 80 * lr_mult

# ---- Runtime settings ----
checkpoint_config = dict(interval=20)
log_config = dict(interval=10, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]

# ---- Dataset locations and normalisation ----
dataset_type = 'RetinaFaceDataset'
data_root = '../data/retinaface/'
train_root = data_root + 'train/'
val_root = data_root + 'val/'
img_norm_cfg = dict(
    mean=[127.5, 127.5, 127.5],
    std=[128.0, 128.0, 128.0],
    to_rgb=True,
)

# ---- Pipelines ----
train_pipeline = [
    dict(type='LoadImageFromFile', to_float32=True),
    dict(type='LoadAnnotations', with_bbox=True, with_keypoints=True),
    dict(
        type='RandomSquareCrop',
        crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0],
    ),
    # Training images are resized to a fixed 416x416 (aspect ratio not kept).
    dict(type='Resize', img_scale=(416, 416), keep_ratio=False),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='PhotoMetricDistortion',
        brightness_delta=32,
        contrast_range=(0.5, 1.5),
        saturation_range=(0.5, 1.5),
        hue_delta=18,
    ),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=[
            'img',
            'gt_bboxes',
            'gt_labels',
            'gt_bboxes_ignore',
            'gt_keypointss',
        ],
    ),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(416, 416),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip', flip_ratio=0.0),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size=(416, 416), pad_val=0),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# ---- Data loaders ----
# val and test read the same annotation file and image folder and share the
# test pipeline; train uses the augmenting pipeline defined above.
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=3,
    train=dict(
        type=dataset_type,
        ann_file=train_root + 'labelv2.txt',
        img_prefix=train_root + 'images/',
        pipeline=train_pipeline,
    ),
    val=dict(
        type=dataset_type,
        ann_file=val_root + 'labelv2.txt',
        img_prefix=val_root + 'images/',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        ann_file=val_root + 'labelv2.txt',
        img_prefix=val_root + 'images/',
        pipeline=test_pipeline,
    ),
)

# ---- Model ----
model = dict(
    type='SCRFD',
    backbone=dict(
        type='MobileNetV1',
        block_cfg=dict(
            stage_blocks=(2, 3, 2, 6),
            stage_planes=[16, 16, 40, 72, 152, 288],
        ),
    ),
    neck=dict(
        type='PAFPN',
        in_channels=[40, 72, 152, 288],
        out_channels=16,
        start_level=1,
        add_extra_convs='on_output',
        num_outs=3,
    ),
    bbox_head=dict(
        type='SCRFDHead',
        num_classes=1,
        in_channels=16,
        stacked_convs=2,
        feat_channels=64,
        # Alternative kept for reference: norm_cfg=dict(type='BN', requires_grad=True)
        norm_cfg=dict(type='GN', num_groups=16, requires_grad=True),
        cls_reg_share=True,
        strides_share=True,
        dw_conv=True,
        scale_mode=2,
        anchor_generator=dict(
            type='AnchorGenerator',
            ratios=[1.0],
            scales=[1, 2],
            base_sizes=[16, 64, 256],
            strides=[8, 16, 32],
        ),
        loss_cls=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=1.0,
        ),
        loss_dfl=False,
        reg_max=8,
        loss_bbox=dict(type='DIoULoss', loss_weight=2.0),
        # Keypoint branch is switched off; loss_kps is still listed.
        use_kps=False,
        loss_kps=dict(
            type='SmoothL1Loss',
            beta=0.1111111111111111,
            loss_weight=0.1,
        ),
        train_cfg=dict(
            assigner=dict(type='ATSSAssigner', topk=9),
            allowed_border=-1,
            pos_weight=-1,
            debug=False,
        ),
        test_cfg=dict(
            nms_pre=-1,
            min_bbox_size=0,
            score_thr=0.02,
            nms=dict(type='nms', iou_threshold=0.45),
            max_per_img=-1,
        ),
    ),
)

# Top-level copies of the train/test settings that also appear in bbox_head.
train_cfg = dict(
    assigner=dict(type='ATSSAssigner', topk=9),
    allowed_border=-1,
    pos_weight=-1,
    debug=False,
)
test_cfg = dict(
    nms_pre=-1,
    min_bbox_size=0,
    score_thr=0.02,
    nms=dict(type='nms', iou_threshold=0.45),
    max_per_img=-1,
)
epoch_multi = 1
evaluation = dict(interval=20, metric='mAP')

训练 python ./tools/train.py ./configs/scrfd/scrfd_500m.py --gpus=1

遇到的问题 ❓

Only tuples, lists and Variables are supported as JIT inputs/outputs. Dictionaries and strings are also accepted, but their usage is not recommended. Here, received an input of unsupported type: numpy.ndarray

# 把下面这个文件中出现的这行代码全部注释掉即可。参考: https://github.com/deepinsight/insightface/issues/2054
lib/python3.7/site-packages/torch/onnx/utils.py
# args = _decide_input_format(model, args)

AttributeError: module 'pycocotools' has no attribute '__version__'

pip uninstall pycocotools
pip uninstall mmpycocotools
pip install mmpycocotools (如果还报错,就换个版本安装试试)

ModuleNotFoundError: No module named ‘mmcv._ext’

# 获取版本 , 根据版本安装 
# 参考: https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html
python -c 'import torch;print(torch.__version__);print(torch.version.cuda)'
pip install mmcv-full==1.6.2 -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12/index.html

ModuleNotFoundError: No module named ‘mmdet’

pip install -v -e . --user

NCNN 💞

ncnn 推理可以参考 nihui 大佬的教程，稳的一匹。
https://zhuanlan.zhihu.com/p/372332267

END 🔚

记录下训练全过程，以便日后再次使用，有需要的小伙伴可以参考参考，告辞。💥

在这里插入图片描述

本文含有隐藏内容，请开通VIP 后查看

基于SCRFD，训练人脸数据集

环境搭建 🚀

测试 ⛵️

训练人脸数据集 🤢

遇到的问题 ❓

NCNN 💞

END 🔚

网站公告

今日签到

热门文章

最新发布

基于SCRFD，训练人脸数据集

环境搭建 ​🚀

测试 ⛵️​

训练人脸数据集 🤢​

遇到的问题 ❓​

NCNN 💞

END 🔚​

网站公告

今日签到

热门文章

最新发布

环境搭建 🚀

测试 ⛵️

训练人脸数据集 🤢

遇到的问题 ❓

END 🔚