YOLOv11: Installation, Model Training, tensorrtx Acceleration, and Image Prediction from Qt


I. YOLOv11 Python Environment Setup

  1. Base environment
    CUDA: cuda_11.8.0_522.06_windows
    cuDNN: cudnn-windows-x86_64-8.6.0.163_cuda11-archive
  2. Create and activate the Python environment
    conda create --name yolov11 python=3.10 -y
    conda activate yolov11
    
  3. Install PyTorch (CUDA 11.8 build)
    pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
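
    A quick way to confirm the CUDA build is active (a minimal check, not part of the original steps):

    import torch
    print(torch.__version__)           # expected: 2.6.0+cu118
    print(torch.cuda.is_available())   # expected: True with a working CUDA 11.8 install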
    
  4. Install YOLOv11 (the ultralytics package)
    pip install ultralytics -i https://pypi.mirrors.ustc.edu.cn/simple/
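
    To verify the installation, the ultralytics package exposes a checks() helper that prints its version and the detected GPU:

    import ultralytics
    ultralytics.checks()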
    
  5. Install the remaining required libraries
    pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple/
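
    With the environment ready, training and prediction take only a few lines of the ultralytics Python API. A minimal sketch (data.yaml, test.jpg, and the hyperparameters are placeholder values to replace with your own):

    from ultralytics import YOLO

    # Start from the pretrained yolo11n checkpoint (downloaded automatically)
    model = YOLO("yolo11n.pt")

    # Train on a custom dataset described by a YOLO-format data.yaml
    model.train(data="data.yaml", epochs=100, imgsz=640, batch=16)

    # Predict on an image and save the annotated result
    model.predict("test.jpg", save=True)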
    

II. YOLOv11 Inference Acceleration with tensorrtx on Windows 10

  1. Download tensorrtx from GitHub

    git clone https://github.com/wang-xinyu/tensorrtx.git
    
  2. Enter the yolo11 directory and convert the model from .pt to .wts

    python gen_wts.py -w D:\code\ultralytics-main\yolo11n.pt -o yolo11n.wts -t detect
    

    Note: PyTorch 2.6 changed torch.load's default to weights_only=True, so you must edit the torch.load call in gen_wts.py and add weights_only=False; otherwise loading the .pt checkpoint fails.
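
    A sketch of that change inside gen_wts.py (the surrounding code may differ slightly between tensorrtx versions):

    import torch

    pt_file = "yolo11n.pt"   # placeholder; gen_wts.py takes this from its -w argument
    device = "cpu"

    # PyTorch 2.6 switched torch.load's default to weights_only=True, which
    # rejects full checkpoints; weights_only=False restores the old behavior
    # for this trusted .pt file.
    model = torch.load(pt_file, map_location=device, weights_only=False)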

  3. Modify CMakeLists.txt
    Adjust the following paths to match your own OpenCV, TensorRT, and dirent installation directories:

    cmake_minimum_required(VERSION 3.10)
    
    project(yolov11)
    
    add_definitions(-std=c++11)
    add_definitions(-DAPI_EXPORTS)
    add_compile_definitions(NOMINMAX)
    
    set(CMAKE_CXX_STANDARD 11)
    set(CMAKE_BUILD_TYPE Debug)
    
    set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)
    set(CMAKE_CUDA_COMPILER "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe")
    enable_language(CUDA)
    
    include_directories(${PROJECT_SOURCE_DIR}/include)
    include_directories(${PROJECT_SOURCE_DIR}/plugin)
    
    # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
    if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
        message("embed_platform on")
        include_directories(/usr/local/cuda/targets/aarch64-linux/include)
        link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
    else()
        message("embed_platform off")
        # cuda
        find_package(CUDA REQUIRED)
        include_directories(${CUDA_INCLUDE_DIRS})
        
        # tensorrt
        set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\TensorRT-8.6.0.12")  
        set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include) 
        set(TRT_LIB_DIRS ${TRT_DIR}\\lib) 
        include_directories(${TRT_INCLUDE_DIRS})
        link_directories(${TRT_LIB_DIRS})
    
        # opencv
        set(OpenCV_DIR "D:\\Program Files\\opencv\\build") 
        set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include) 
        set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib) 
        set(OpenCV_Debug_LIBS "opencv_world4110d.lib") 
        set(OpenCV_Release_LIBS "opencv_world4110.lib") 
        include_directories(${OpenCV_INCLUDE_DIRS})
        link_directories(${OpenCV_LIB_DIRS})
    
        # dirent
        set(Dirent_INCLUDE_DIRS "D:\\Program Files\\dirent\\include")
        include_directories(${Dirent_INCLUDE_DIRS})
    endif()
    
    add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
    target_link_libraries(myplugins nvinfer cudart)
    
    file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
    
    add_executable(yolo11_det ${PROJECT_SOURCE_DIR}/yolo11_det.cpp ${SRCS})
    target_link_libraries(yolo11_det nvinfer)
    target_link_libraries(yolo11_det cudart)
    target_link_libraries(yolo11_det myplugins)
    target_link_libraries(yolo11_det ${OpenCV_Debug_LIBS})
    target_link_libraries(yolo11_det ${OpenCV_Release_LIBS})
    
    add_executable(yolo11_cls ${PROJECT_SOURCE_DIR}/yolo11_cls.cpp ${SRCS})
    target_link_libraries(yolo11_cls nvinfer)
    target_link_libraries(yolo11_cls cudart)
    target_link_libraries(yolo11_cls myplugins)
    target_link_libraries(yolo11_cls ${OpenCV_Debug_LIBS})
    target_link_libraries(yolo11_cls ${OpenCV_Release_LIBS})
    
    add_executable(yolo11_seg ${PROJECT_SOURCE_DIR}/yolo11_seg.cpp ${SRCS})
    target_link_libraries(yolo11_seg nvinfer)
    target_link_libraries(yolo11_seg cudart)
    target_link_libraries(yolo11_seg myplugins)
    target_link_libraries(yolo11_seg ${OpenCV_Debug_LIBS})
    target_link_libraries(yolo11_seg ${OpenCV_Release_LIBS})
    
    add_executable(yolo11_pose ${PROJECT_SOURCE_DIR}/yolo11_pose.cpp ${SRCS})
    target_link_libraries(yolo11_pose nvinfer)
    target_link_libraries(yolo11_pose cudart)
    target_link_libraries(yolo11_pose myplugins)
    target_link_libraries(yolo11_pose ${OpenCV_Debug_LIBS})
    target_link_libraries(yolo11_pose ${OpenCV_Release_LIBS})
    
    add_executable(yolo11_obb ${PROJECT_SOURCE_DIR}/yolo11_obb.cpp ${SRCS})
    target_link_libraries(yolo11_obb nvinfer)
    target_link_libraries(yolo11_obb cudart)
    target_link_libraries(yolo11_obb myplugins)
    target_link_libraries(yolo11_obb ${OpenCV_Debug_LIBS})
    target_link_libraries(yolo11_obb ${OpenCV_Release_LIBS})
    
  4. Generate the build files with CMake

    mkdir build
    cd build
    cmake ..
    

  5. Open the generated yolov11.sln in Visual Studio and build the solution.

  6. Convert .wts to .engine
    Before converting, edit the configuration to match your own model; in the tensorrtx project this is config.h, which holds constants such as kNumClass, kInputH, kInputW, kConfThresh, and kNmsThresh. Then run yolo11_det with -s; the last argument is the model scale (n/s/m/l/x):

    -s ..\yolo11n.wts yolo11n.engine n
    
  7. Run inference with the converted .engine
    The arguments are the engine file, the image directory, and the post-processing mode ('c' for CPU NMS, 'g' for GPU decode and NMS):

    -d yolo11n.engine D:\code\yolov5-6.1\data\images g
    


III. Calling the tensorrtx-Accelerated YOLOv11 from Qt for Detection on Windows 10

  1. Copy the required files
    Copy the include, plugin, and src directories from the tensorrtx yolo11 project into the Qt project directory, since the CMakeLists.txt below references them via ${PROJECT_SOURCE_DIR}.
  2. Modify the Qt project's CMakeLists.txt as follows:
    cmake_minimum_required(VERSION 3.5)
    
    project(yolov11Test LANGUAGES CXX)
    
    add_definitions(-std=c++11)
    add_definitions(-DAPI_EXPORTS)
    add_compile_definitions(NOMINMAX)
    
    set(CMAKE_CXX_STANDARD 17)
    set(CMAKE_CXX_STANDARD_REQUIRED ON)
    
    set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)
    set(CMAKE_CUDA_COMPILER "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe")
    enable_language(CUDA)
    
    include_directories(${PROJECT_SOURCE_DIR}/include)
    include_directories(${PROJECT_SOURCE_DIR}/plugin)
    
    # cuda
    find_package(CUDA REQUIRED)
    include_directories(${CUDA_INCLUDE_DIRS})
    
    # tensorrt
    set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\TensorRT-8.6.0.12")
    set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include)
    set(TRT_LIB_DIRS ${TRT_DIR}\\lib)
    include_directories(${TRT_INCLUDE_DIRS})
    link_directories(${TRT_LIB_DIRS})
    
    # opencv
    set(OpenCV_DIR "D:\\Program Files\\opencv\\build")
    set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include)
    set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib)
    set(OpenCV_Debug_LIBS "opencv_world4110d.lib")
    set(OpenCV_Release_LIBS "opencv_world4110.lib")
    include_directories(${OpenCV_INCLUDE_DIRS})
    link_directories(${OpenCV_LIB_DIRS})
    
    # dirent
    set(Dirent_INCLUDE_DIRS "D:\\Program Files\\dirent\\include")
    include_directories(${Dirent_INCLUDE_DIRS})
    
    add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
    target_link_libraries(myplugins nvinfer cudart)
    
    file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
    
    add_executable(yolov11Test main.cpp ${SRCS})
    
    target_link_libraries(yolov11Test nvinfer)
    target_link_libraries(yolov11Test cudart)
    target_link_libraries(yolov11Test myplugins)
    target_link_libraries(yolov11Test ${OpenCV_Debug_LIBS})
    target_link_libraries(yolov11Test ${OpenCV_Release_LIBS})
    
    install(TARGETS yolov11Test
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    )
    
    
  3. The main.cpp source is as follows:
    #include <fstream>
    #include <iostream>
    #include <opencv2/opencv.hpp>
    #include "cuda_utils.h"
    #include "logging.h"
    #include "model.h"
    #include "postprocess.h"
    #include "preprocess.h"
    #include "utils.h"
    
    Logger gLogger;
    using namespace nvinfer1;
    const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
    
    void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context)
    {
        std::ifstream file(engine_name, std::ios::binary);
        if (!file.good())
        {
            std::cerr << "read " << engine_name << " error!" << std::endl;
            assert(false);
        }
        size_t size = 0;
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        char* serialized_engine = new char[size];
        assert(serialized_engine);
        file.read(serialized_engine, size);
        file.close();
    
        *runtime = createInferRuntime(gLogger);
        assert(*runtime);
        *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
        assert(*engine);
        *context = (*engine)->createExecutionContext();
        assert(*context);
        delete[] serialized_engine;
    }
    
    void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process)
    {
        assert(engine->getNbBindings() == 2);
        // In order to bind the buffers, we need to know the names of the input and output tensors.
        // Note that indices are guaranteed to be less than IEngine::getNbBindings()
        const int inputIndex = engine->getBindingIndex(kInputTensorName);
        const int outputIndex = engine->getBindingIndex(kOutputTensorName);
        assert(inputIndex == 0);
        assert(outputIndex == 1);
        // Create GPU buffers on device
        CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
        if (cuda_post_process == "c") {
            *output_buffer_host = new float[kBatchSize * kOutputSize];
        } else if (cuda_post_process == "g") {
            if (kBatchSize > 1) {
                std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
                exit(0);
            }
            // Allocate memory for decode_ptr_host and copy to device
            *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
            CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
        }
    }
    
    void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
               float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
        // infer on the batch asynchronously, and DMA output back to host
        auto start = std::chrono::system_clock::now();
        context.enqueueV2(buffers, stream, nullptr);
        if (cuda_post_process == "c") {
            CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                                       stream));
            auto end = std::chrono::system_clock::now();
            std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                      << "ms" << std::endl;
        } else if (cuda_post_process == "g") {
            CUDA_CHECK(
                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
            cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
            cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
            CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                                       sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                                       stream));
            auto end = std::chrono::system_clock::now();
            std::cout << "inference and gpu postprocess time: "
                      << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        }
    
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
    
    int main(int argc, char** argv) {
        // yolo11_det -s ../models/yolo11n.wts ../models/yolo11n.fp32.trt n
        // yolo11_det -d ../models/yolo11n.fp32.trt ../images c
        cudaSetDevice(kGpuId);
        std::string engine_name = "D:\\code\\tensorrtx\\yolo11\\build\\yolo11n.engine";  // path to the converted engine file
        std::string img_dir = "D:\\code\\yolov5-6.1\\data\\images\\";  // directory containing the images to predict
        std::string cuda_post_process = "g";
        int model_bboxes;
        float gd = 0, gw = 0;
        int max_channels = 0;
    
        // Deserialize the engine from file
        IRuntime* runtime = nullptr;
        ICudaEngine* engine = nullptr;
        IExecutionContext* context = nullptr;
        deserialize_engine(engine_name, &runtime, &engine, &context);
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));
        cuda_preprocess_init(kMaxInputImageSize);
        auto out_dims = engine->getBindingDimensions(1);
        model_bboxes = out_dims.d[0];
    
        // Prepare CPU and GPU buffers
        float* device_buffers[2];
        float* output_buffer_host = nullptr;
        float* decode_ptr_host = nullptr;
        float* decode_ptr_device = nullptr;
    
        // Read image file names from the directory
        std::vector<std::string> file_names;
        if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
            std::cerr << "read_files_in_dir failed." << std::endl;
            return -1;
        }
    
        prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
                       &decode_ptr_device, cuda_post_process);
    
        // Batch prediction
        for (size_t i = 0; i < file_names.size(); i += kBatchSize)
        {
            // Read a batch of images with OpenCV
            std::vector<cv::Mat> img_batch;
            std::vector<std::string> img_name_batch;
            for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++)
            {
                cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
                img_batch.push_back(img);
                img_name_batch.push_back(file_names[j]);
            }
            // Preprocess
            cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
            // Run inference
            infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
                  decode_ptr_device, model_bboxes, cuda_post_process);
            // Optionally dump the first 100 values of output_buffer_host, one per line:
            //        std::ofstream out("../models/output.txt");
            //        for (int j = 0; j < 100; j++) {
            //            out << output_buffer_host[j] << std::endl;
            //        }
            //        out.close();
            std::vector<std::vector<Detection>> res_batch;
            if (cuda_post_process == "c")
            {
                // CPU NMS (non-maximum suppression)
                batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
            } else if (cuda_post_process == "g")
            {
                // Process GPU decode and NMS results
                batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
            }
    
            // Draw bounding boxes
            draw_bbox(img_batch, res_batch);
            // Display the results
            for (size_t j = 0; j < img_batch.size(); j++)
            {
                cv::imshow("results", img_batch[j]);
                cv::waitKey(0);
            }
    
            // Save the annotated images
            for (size_t j = 0; j < img_batch.size(); j++)
            {
                cv::imwrite("_" + img_name_batch[j], img_batch[j]);
            }
        }
    
        // Release stream and buffers
        cudaStreamDestroy(stream);
        CUDA_CHECK(cudaFree(device_buffers[0]));
        CUDA_CHECK(cudaFree(device_buffers[1]));
        CUDA_CHECK(cudaFree(decode_ptr_device));
        delete[] decode_ptr_host;
        delete[] output_buffer_host;
        cuda_preprocess_destroy();
        // Destroy the engine
        delete context;
        delete engine;
        delete runtime;
    
    
        return 0;
    }
    
    
