文章目录
一. yolov11 python环境安装
- 基础环境
CUDA:cuda_11.8.0_522.06_windows
cudnn:cudnn-windows-x86_64-8.6.0.163_cuda11-archive - 创建python环境
conda create --name yolov11 python=3.10 -y
- 安装pytorch
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
- 安装yolov11
pip install ultralytics -i https://pypi.mirrors.ustc.edu.cn/simple/
- 安装必要的库
pip install -r requirements.txt -i https://pypi.mirrors.ustc.edu.cn/simple/
二. windows10下yolov11 tensorrtx推理加速
官网下载tensorrtx
git clone https://github.com/wang-xinyu/tensorrtx.git
进入yolov11文件夹,转换模型.pt转.wts
python gen_wts.py -w D:\code\ultralytics-main\yolo11n.pt -o yolo11n.wts -t detect
注意:pytorch2.6需要修改代码中的torch.load,在里面添加weigths_only=False
修改cmakeList.txt文件
根据自己的opencv,tensort,dirent所在目录路径,修改以下文件路径cmake_minimum_required(VERSION 3.10) project(yolov11) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) add_compile_definitions(NOMINMAX) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86) set(CMAKE_CUDA_COMPILER "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe") enable_language(CUDA) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/plugin) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") message("embed_platform on") include_directories(/usr/local/cuda/targets/aarch64-linux/include) link_directories(/usr/local/cuda/targets/aarch64-linux/lib) else() message("embed_platform off") # cuda find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) # tensorrt set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\TensorRT-8.6.0.12") set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include) set(TRT_LIB_DIRS ${TRT_DIR}\\lib) include_directories(${TRT_INCLUDE_DIRS}) link_directories(${TRT_LIB_DIRS}) # opencv set(OpenCV_DIR "D:\\Program Files\\opencv\\build") set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include) set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib) set(OpenCV_Debug_LIBS "opencv_world4110d.lib") set(OpenCV_Release_LIBS "opencv_world4110.lib") include_directories(${OpenCV_INCLUDE_DIRS}) link_directories(${OpenCV_LIB_DIRS}) # dirent set(Dirent_INCLUDE_DIRS "D:\\Program Files\\dirent\\include") include_directories(${Dirent_INCLUDE_DIRS}) endif() add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) target_link_libraries(myplugins nvinfer cudart) file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) add_executable(yolo11_det ${PROJECT_SOURCE_DIR}/yolo11_det.cpp ${SRCS}) target_link_libraries(yolo11_det nvinfer) target_link_libraries(yolo11_det cudart) target_link_libraries(yolo11_det myplugins) target_link_libraries(yolo11_det ${OpenCV_Debug_LIBS}) target_link_libraries(yolo11_det ${OpenCV_Release_LIBS}) add_executable(yolo11_cls ${PROJECT_SOURCE_DIR}/yolo11_cls.cpp ${SRCS}) target_link_libraries(yolo11_cls nvinfer) target_link_libraries(yolo11_cls cudart) target_link_libraries(yolo11_cls myplugins) target_link_libraries(yolo11_cls ${OpenCV_Debug_LIBS}) target_link_libraries(yolo11_cls ${OpenCV_Release_LIBS}) add_executable(yolo11_seg ${PROJECT_SOURCE_DIR}/yolo11_seg.cpp ${SRCS}) target_link_libraries(yolo11_seg nvinfer) target_link_libraries(yolo11_seg cudart) target_link_libraries(yolo11_seg myplugins) target_link_libraries(yolo11_seg ${OpenCV_Debug_LIBS}) target_link_libraries(yolo11_seg ${OpenCV_Release_LIBS}) add_executable(yolo11_pose ${PROJECT_SOURCE_DIR}/yolo11_pose.cpp ${SRCS}) target_link_libraries(yolo11_pose nvinfer) target_link_libraries(yolo11_pose cudart) target_link_libraries(yolo11_pose myplugins) target_link_libraries(yolo11_pose ${OpenCV_Debug_LIBS}) target_link_libraries(yolo11_pose ${OpenCV_Release_LIBS}) add_executable(yolo11_obb ${PROJECT_SOURCE_DIR}/yolo11_obb.cpp ${SRCS}) target_link_libraries(yolo11_obb nvinfer) target_link_libraries(yolo11_obb cudart) target_link_libraries(yolo11_obb myplugins) target_link_libraries(yolo11_obb ${OpenCV_Debug_LIBS}) target_link_libraries(yolo11_obb ${OpenCV_Release_LIBS})
构建项目
mkdir build cd build cmake ..
5. vs打开项目,生成解决方案
装换.wts为.engine
转换前,这里需要根据自己的模型,修改对应的配置,配置文件在以下位置
-s ..\yolo11n.wts yolo11n.engine n
利用转换好的.engine进行推理
-d yolo11n.engine D:\code\yolov5-6.1\data\images g
三. windows10下qt调用tensorrtx加速的yolov11进行检测
- 拷贝文件
- 修改Qt项目中的cmakeList.txt文件如下:
cmake_minimum_required(VERSION 3.5) project(yolov11Test LANGUAGES CXX) add_definitions(-std=c++11) add_definitions(-DAPI_EXPORTS) add_compile_definitions(NOMINMAX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86) set(CMAKE_CUDA_COMPILER "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe") enable_language(CUDA) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/plugin) # cuda find_package(CUDA REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) # tensorrt set(TRT_DIR "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\TensorRT-8.6.0.12") set(TRT_INCLUDE_DIRS ${TRT_DIR}\\include) set(TRT_LIB_DIRS ${TRT_DIR}\\lib) include_directories(${TRT_INCLUDE_DIRS}) link_directories(${TRT_LIB_DIRS}) # opencv set(OpenCV_DIR "D:\\Program Files\\opencv\\build") set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include) set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc16\\lib) set(OpenCV_Debug_LIBS "opencv_world4110d.lib") set(OpenCV_Release_LIBS "opencv_world4110.lib") include_directories(${OpenCV_INCLUDE_DIRS}) link_directories(${OpenCV_LIB_DIRS}) # dirent set(Dirent_INCLUDE_DIRS "D:\\Program Files\\dirent\\include") include_directories(${Dirent_INCLUDE_DIRS}) add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) target_link_libraries(myplugins nvinfer cudart) file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) add_executable(yolov11Test main.cpp ${SRCS}) target_link_libraries(yolov11Test nvinfer) target_link_libraries(yolov11Test cudart) target_link_libraries(yolov11Test myplugins) target_link_libraries(yolov11Test ${OpenCV_Debug_LIBS}) target_link_libraries(yolov11Test ${OpenCV_Release_LIBS}) install(TARGETS yolov11Test LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} )
- main函数代码如下:
#include <fstream> #include <iostream> #include <opencv2/opencv.hpp> #include "cuda_utils.h" #include "logging.h" #include "model.h" #include "postprocess.h" #include "preprocess.h" #include "utils.h" Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } size_t size = 0; file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); *runtime = createInferRuntime(gLogger); assert(*runtime); *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); assert(*engine); *context = (*engine)->createExecutionContext(); assert(*context); delete[] serialized_engine; } void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { assert(engine->getNbBindings() == 2); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine->getBindingIndex(kInputTensorName); const int outputIndex = engine->getBindingIndex(kOutputTensorName); assert(inputIndex == 0); assert(outputIndex == 1); // Create GPU buffers on device CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; } else if (cuda_post_process == "g") { if (kBatchSize > 1) { std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; exit(0); } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueueV2(buffers, stream, nullptr); if (cuda_post_process == "c") { CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; } else if (cuda_post_process == "g") { CUDA_CHECK( cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); auto end = std::chrono::system_clock::now(); std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } int main(int argc, char** argv) { // yolo11_det -s ../models/yolo11n.wts ../models/yolo11n.fp32.trt n // yolo11_det -d ../models/yolo11n.fp32.trt ../images c cudaSetDevice(kGpuId); std::string engine_name= "D:\\code\\tensorrtx\\yolo11\\build\\yolo11n.engine"; //转换好的模型文件路径 std::string img_dir= "D:\\code\\yolov5-6.1\\data\\images\\"; //要预测的图像文件夹所在路径 std::string cuda_post_process = "g"; int model_bboxes; float gd = 0, gw = 0; int max_channels = 0; // 反序列化模型文件 Deserialize the engine from file IRuntime* runtime = nullptr; ICudaEngine* engine = nullptr; IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); cuda_preprocess_init(kMaxInputImageSize); auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // 准备cpu和gpu缓存 Prepare cpu and gpu buffers float* device_buffers[2]; float* output_buffer_host = nullptr; float* decode_ptr_host = nullptr; float* decode_ptr_device = nullptr; // 从文件夹中读取图像 Read images from directory std::vector<std::string> file_names; if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { std::cerr << "read_files_in_dir failed." << std::endl; return -1; } prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // 批预测batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { // 通过opencv读取一批图像Get a batch of images std::vector<cv::Mat> img_batch; std::vector<std::string> img_name_batch; for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); img_batch.push_back(img); img_name_batch.push_back(file_names[j]); } // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // 进行推理Run inference infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); // 保存output_buffer_host的前100个值,一行一个 // std::ofstream out("../models/output.txt"); // for (int j = 0; j < 100; j++) { // out << output_buffer_host[j] << std::endl; // } // out.close(); std::vector<std::vector<Detection>> res_batch; if (cuda_post_process == "c") { // NMS非极大值抑制 batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); } else if (cuda_post_process == "g") { //GPU非极大值抑制Process gpu decode and nms results batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); } // 绘制结果Draw bounding boxes draw_bbox(img_batch, res_batch); //显示图像 for (size_t j = 0; j < img_batch.size(); j++) { cv::imshow("results", img_batch[j]); cv::waitKey(0); } // 保存图像Save images for (size_t j = 0; j < img_batch.size(); j++) { cv::imwrite("_" + img_name_batch[j], img_batch[j]); } } // Release stream and buffers cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(device_buffers[0])); CUDA_CHECK(cudaFree(device_buffers[1])); CUDA_CHECK(cudaFree(decode_ptr_device)); delete[] decode_ptr_host; delete[] output_buffer_host; cuda_preprocess_destroy(); // Destroy the engine delete context; delete engine; delete runtime; // Print histogram of the output distribution //std::cout << "\nOutput:\n\n"; //for (unsigned int i = 0; i < kOutputSize; i++) //{ // std::cout << prob[i] << ", "; // if (i % 10 == 0) std::cout << std::endl; //} //std::cout << std::endl; return 0; }