本文使用ONNX版本为1.14.1,OpenCv版本为4.8.0(同时也支持4.5.0,4.7.0),RTDETR不需要后处理所以不需要进行后处理,与YOLO不同的点还有YOLO需要特征金字塔所以输出的tenser尺寸是动态变动的,RTDETR默认输入300个框,所以Tenser在输出上默认为[1,300,class+4]。
#include <iostream>
#include <vector>
#include <string>
#include <memory>
#include <random>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>
class RTDETR {
private:
std::string model_path_;
std::string img_path_;
float conf_thres_;
std::unique_ptr<Ort::Session> session_;
std::unique_ptr<Ort::Env> env_;
Ort::MemoryInfo memory_info_;
int input_width_;
int input_height_;
std::vector<std::string> classes_;
std::vector<cv::Scalar> color_palette_;
cv::Mat img_;
int img_height_;
int img_width_;
//更换你需要的类别
void initializeClasses() {
classes_ = {
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
"boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
"bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
"giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
"fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
"broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
"remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
"refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
"toothbrush"
};
}
void generateColorPalette() {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0, 255);
color_palette_.clear();
for (size_t i = 0; i < classes_.size(); ++i) {
color_palette_.push_back(cv::Scalar(dis(gen), dis(gen), dis(gen)));
}
}
public:
RTDETR(const std::string& model_path,
const std::string& img_path,
float conf_thres = 0.5f)
: model_path_(model_path), img_path_(img_path),
conf_thres_(conf_thres),
memory_info_(Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault)) {
// Initialize ONNX Runtime
env_ = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "RTDETR");
Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(1);
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
// Create session
#ifdef _WIN32
std::wstring wide_model_path(model_path.begin(), model_path.end());
session_ = std::make_unique<Ort::Session>(*env_, wide_model_path.c_str(), session_options);
#else
session_ = std::make_unique<Ort::Session>(*env_, model_path.c_str(), session_options);
#endif
// Get input dimensions
auto input_type_info = session_->GetInputTypeInfo(0);
auto input_tensor_info = input_type_info.GetTensorTypeAndShapeInfo();
auto input_dims = input_tensor_info.GetShape();
if (input_dims.size() >= 4) {
input_height_ = static_cast<int>(input_dims[2]);
input_width_ = static_cast<int>(input_dims[3]);
}
else {
throw std::runtime_error("Invalid model input dimensions");
}
std::cout << "Model input size: " << input_width_ << "x" << input_height_ << std::endl;
// Initialize classes and color palette
initializeClasses();
generateColorPalette();
std::cout << "Loaded " << classes_.size() << " classes" << std::endl;
}
void drawDetections(const std::vector<float>& box, float score, int class_id) {
if (class_id >= static_cast<int>(color_palette_.size()) || class_id < 0) return;
int x1 = static_cast<int>(box[0]);
int y1 = static_cast<int>(box[1]);
int x2 = static_cast<int>(box[2]);
int y2 = static_cast<int>(box[3]);
cv::Scalar color = color_palette_[class_id];
// Draw bounding box
cv::rectangle(img_, cv::Point(x1, y1), cv::Point(x2, y2), color, 2);
// Create label
std::string label = classes_[class_id] + ": " + std::to_string(score).substr(0, 4);
// Get text size
int baseline = 0;
cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);
// Calculate label position
int label_x = x1;
int label_y = (y1 - 10 > text_size.height) ? y1 - 10 : y1 + 10;
// Draw label background
cv::rectangle(img_,
cv::Point(label_x, label_y - text_size.height),
cv::Point(label_x + text_size.width, label_y + text_size.height),
color, cv::FILLED);
// Draw label text
cv::putText(img_, label, cv::Point(label_x, label_y),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0), 1, cv::LINE_AA);
}
std::vector<float> preprocess() {
// Read image
img_ = cv::imread(img_path_);
if (img_.empty()) {
throw std::runtime_error("Could not load image: " + img_path_);
}
img_height_ = img_.rows;
img_width_ = img_.cols;
std::cout << "Original image size: " << img_width_ << "x" << img_height_ << std::endl;
// Convert BGR to RGB
cv::Mat rgb_img;
cv::cvtColor(img_, rgb_img, cv::COLOR_BGR2RGB);
// Resize image
cv::Mat resized_img;
cv::resize(rgb_img, resized_img, cv::Size(input_width_, input_height_));
// Normalize and convert to float
resized_img.convertTo(resized_img, CV_32F, 1.0 / 255.0);
// Convert to CHW format and flatten
std::vector<float> input_data;
input_data.reserve(3 * input_height_ * input_width_);
// Channel-first format (CHW)
for (int c = 0; c < 3; ++c) {
for (int h = 0; h < input_height_; ++h) {
for (int w = 0; w < input_width_; ++w) {
input_data.push_back(resized_img.at<cv::Vec3f>(h, w)[c]);
}
}
}
return input_data;
}
std::vector<std::vector<float>> bboxCxcywhToXyxy(const std::vector<std::vector<float>>& boxes) {
std::vector<std::vector<float>> result;
for (const auto& box : boxes) {
if (box.size() < 4) continue;
float cx = box[0];
float cy = box[1];
float w = box[2];
float h = box[3];
float half_w = w / 2.0f;
float half_h = h / 2.0f;
float x1 = cx - half_w;
float y1 = cy - half_h;
float x2 = cx + half_w;
float y2 = cy + half_h;
result.push_back({ x1, y1, x2, y2 });
}
return result;
}
cv::Mat postprocess(const std::vector<float>& model_output) {
size_t num_classes = classes_.size();
size_t elements_per_detection = 4 + num_classes; // 4 for bbox + num_classes
size_t num_detections = model_output.size() / elements_per_detection;
std::cout << "Output shape: (" << num_detections << ", " << elements_per_detection << ")" << std::endl;
std::vector<std::vector<float>> boxes;
std::vector<float> scores;
std::vector<int> labels;
for (size_t i = 0; i < num_detections; ++i) {
size_t offset = i * elements_per_detection;
// 获取BoX (center_x, center_y, width, height)
std::vector<float> box = {
model_output[offset], // cx
model_output[offset + 1], // cy
model_output[offset + 2], // w
model_output[offset + 3] // h
};
// 获取类别
std::vector<float> class_scores(model_output.begin() + offset + 4,
model_output.begin() + offset + 4 + num_classes);
// 获取最大类别
auto max_it = std::max_element(class_scores.begin(), class_scores.end());
float max_score = *max_it;
int class_id = static_cast<int>(std::distance(class_scores.begin(), max_it));
if (max_score > conf_thres_) {
boxes.push_back(box);
scores.push_back(max_score);
labels.push_back(class_id);
}
}
// 坐标转换
auto xyxy_boxes = bboxCxcywhToXyxy(boxes);
std::cout << "Found " << xyxy_boxes.size() << " detections above threshold" << std::endl;
// Draw detections
for (size_t i = 0; i < xyxy_boxes.size(); ++i) {
// Scale bounding boxes to original image dimensions
std::vector<float> scaled_box = {
xyxy_boxes[i][0] * img_width_, // x1
xyxy_boxes[i][1] * img_height_, // y1
xyxy_boxes[i][2] * img_width_, // x2
xyxy_boxes[i][3] * img_height_ // y2
};
drawDetections(scaled_box, scores[i], labels[i]);
}
return img_;
}
cv::Mat main() {
try {
// Preprocess
std::vector<float> input_data = preprocess();
// Prepare input tensor
std::vector<int64_t> input_shape = { 1, 3, input_height_, input_width_ };
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
memory_info_, input_data.data(), input_data.size(),
input_shape.data(), input_shape.size());
// Get input/output names
Ort::AllocatorWithDefaultOptions allocator;
auto input_name = session_->GetInputNameAllocated(0, allocator);
auto output_name = session_->GetOutputNameAllocated(0, allocator);
const char* input_names[] = { input_name.get() };
const char* output_names[] = { output_name.get() };
// Run inference
std::cout << "Running inference..." << std::endl;
auto output_tensors = session_->Run(Ort::RunOptions{ nullptr },
input_names, &input_tensor, 1,
output_names, 1);
// Process output
float* output_data = output_tensors[0].GetTensorMutableData<float>();
auto output_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
size_t output_size = 1;
for (auto dim : output_shape) {
output_size *= dim;
}
std::cout << "Output tensor size: " << output_size << std::endl;
std::vector<float> model_output(output_data, output_data + output_size);
return postprocess(model_output);
}
catch (const std::exception& e) {
std::cerr << "Error in main pipeline: " << e.what() << std::endl;
return cv::Mat();
}
}
};
int main() {
try {
// 创建检测器实例,使用与Python代码相同的参数
RTDETR detector("model.onnx",
"bus.jpg",
0.5f);
// 执行检测并获取输出图像
cv::Mat output_image = detector.main();
if (output_image.empty()) {
std::cerr << "Error: Failed to process image" << std::endl;
return -1;
}
// 显示带注释的输出图像
cv::namedWindow("Output", cv::WINDOW_NORMAL);
cv::imshow("Output", output_image);
cv::waitKey(0);
cv::destroyAllWindows();
std::cout << "Detection completed successfully!" << std::endl;
}
catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return -1;
}
return 0;
}
运行结果: