LLaVA is a typical vision-language model. The previous post, "VLM (Vision-Language Model) Deployment: Python Code Based on TensorRT and TensorRT-LLM", ran inference from Python. Since deployment targets are usually implemented in C++, this post does the same pipeline in C++; the overall logic is the same as before. Taking llava-interleave-qwen-0.5b-hf as the example, the model consists of SigLIP (vision encoder) + an MLP (projector) + Qwen 0.5B (language model), deployed with TensorRT and TensorRT-LLM.
/*------------------------
Author: kk
Date: 2025.8.27
Version: v0.1
function: vlm llava inference with tensorrt and tensorrt-llm
-------------------------*/
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <vector>
#include <string>
#include <cassert>
#include <iomanip>
#include <algorithm>
#include <cstring>
#include <cstdint>
#include <opencv2/opencv.hpp>
#include "tokenizers_cpp.h"
#include <cuda_fp16.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
// tensorrtllm ns
namespace llm = tensorrt_llm::executor;
// image process parameters
static constexpr int kH = 384;
static constexpr int kW = 384;
static constexpr bool kDoConvertRGB = true;
static constexpr bool kDoResize = true;
static constexpr bool kDoRescale = true;
static constexpr bool kDoNormalize = true;
static constexpr float kRescale = 1.0f / 255.0f;
static constexpr float kMean[3] = {0.5f, 0.5f, 0.5f};
static constexpr float kStd[3] = {0.5f, 0.5f, 0.5f};
static constexpr int kInterp = cv::INTER_CUBIC; // PIL BICUBIC ≈ OpenCV INTER_CUBIC
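// These constants are assumed to mirror the SiglipImageProcessor configuration shipped with
// llava-interleave-qwen-0.5b-hf: 384x384 bicubic resize, rescale by 1/255 and
// per-channel normalization with mean = std = 0.5.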
// =====================================================
// Image preprocessing; output: CHW float32, length 3*kH*kW
inline std::vector<float> preprocess_siglip_chw_f32(const cv::Mat& bgr) {
CV_Assert(!bgr.empty() && bgr.type() == CV_8UC3);
cv::Mat img = bgr;
if (kDoConvertRGB) {cv::cvtColor(img, img, cv::COLOR_BGR2RGB);}
if (kDoResize) {cv::resize(img, img, cv::Size(kW, kH), 0, 0, kInterp); }
img.convertTo(img, CV_32FC3);
if (kDoRescale) {img *= kRescale;}
// Normalize: (x - mean) / std
if (kDoNormalize) {
std::vector<cv::Mat> ch(3);
cv::split(img, ch);
for (int i = 0; i < 3; ++i) {
ch[i] = (ch[i] - kMean[i]) / kStd[i];
}
cv::merge(ch, img);
}
// print img
// cv::Mat roi = img(cv::Rect(0,0,5,5)).clone();
// std::cout << cv::format(roi, cv::Formatter::FMT_NUMPY) << std::endl;
// HWC -> CHW
std::vector<float> out(3 * kH * kW);
std::vector<cv::Mat> ch(3);
cv::split(img, ch);
const size_t plane = static_cast<size_t>(kH) * static_cast<size_t>(kW);
for (int c = 0; c < 3; ++c) {
std::memcpy(out.data() + c * plane, ch[c].ptr<float>(), plane * sizeof(float));
}
return out;
}
// =====================================================
// ----------------- load tensorrt engine -----------------
class Logger : public nvinfer1::ILogger {
public:
explicit Logger(Severity s = Severity::kWARNING) : reportableSeverity(s) {}
void log(Severity severity, const char* msg) noexcept override {
if (severity <= reportableSeverity) {
std::cerr << "[TRT] " << msg << std::endl;
}
}
private:
Severity reportableSeverity;
};
static std::vector<char> readFile(const std::string& path) {
std::ifstream f(path, std::ios::binary);
if (!f) { throw std::runtime_error("Cannot open " + path); }
f.seekg(0, std::ios::end);
size_t sz = static_cast<size_t>(f.tellg());
f.seekg(0, std::ios::beg);
std::vector<char> buf(sz);
f.read(buf.data(), sz);
return buf;
}
static inline void checkCuda(cudaError_t e, const char* file, int line) {
if (e != cudaSuccess) {
std::cerr << "CUDA Error " << cudaGetErrorString(e) << " at " << file << ":" << line << std::endl;
std::exit(1);
}
}
#define CHECK_CUDA(expr) checkCuda((expr), __FILE__, __LINE__)
static inline size_t trtTypeSize(nvinfer1::DataType t) {
    switch (t) {
        case nvinfer1::DataType::kFLOAT: return 4;
        case nvinfer1::DataType::kHALF:  return 2;
        case nvinfer1::DataType::kBF16:  return 2;
        case nvinfer1::DataType::kINT8:  return 1;
        case nvinfer1::DataType::kUINT8: return 1;
        case nvinfer1::DataType::kINT32: return 4;
        case nvinfer1::DataType::kINT64: return 8;
        case nvinfer1::DataType::kBOOL:  return 1;
        default: return 0; // other types (e.g. kFP8, kINT4) are not handled here
    }
}
// compute the element count of a Dims
static inline int64_t volume(const nvinfer1::Dims& d) {
int64_t v = 1;
for (int i = 0; i < d.nbDims; ++i) v *= d.d[i];
return v;
}
template <typename T>
using TrtUniquePtr = std::unique_ptr<T>;
struct EngineIO { std::string inName; std::string outName; };
struct RunResult {
std::vector<uint8_t> outHost; // raw bytes of output (CPU)
nvinfer1::Dims outShape; // TRT output shape
nvinfer1::DataType outType; // TRT output dtype
};
RunResult runEngineV3(
const std::string& enginePath,
const EngineIO& io,
const nvinfer1::Dims& inputShape,
const void* inputDataHost,
size_t inputBytes,
Logger& logger
){
TrtUniquePtr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger)};
auto blob = readFile(enginePath);
TrtUniquePtr<nvinfer1::ICudaEngine> engine{runtime->deserializeCudaEngine(blob.data(), blob.size())};
if (!engine) throw std::runtime_error("Deserialize failed: " + enginePath);
auto hasByName = [&](const std::string& n){
return engine->getTensorIOMode(n.c_str()) != nvinfer1::TensorIOMode::kNONE;
};
if (!hasByName(io.inName) || !hasByName(io.outName)) {
throw std::runtime_error("Tensor not found: " + io.inName + " or " + io.outName);
}
TrtUniquePtr<nvinfer1::IExecutionContext> context{engine->createExecutionContext()};
if (!context) throw std::runtime_error("createExecutionContext failed");
// Set the dynamic input shape
if (!context->setInputShape(io.inName.c_str(), inputShape)) {
throw std::runtime_error("setInputShape failed for " + io.inName);
}
const nvinfer1::DataType outType = engine->getTensorDataType(io.outName.c_str());
const nvinfer1::Dims outShape = context->getTensorShape(io.outName.c_str());
cudaStream_t stream{}; CHECK_CUDA(cudaStreamCreate(&stream));
// Copy the input host -> device (H2D)
void* dIn{nullptr}; CHECK_CUDA(cudaMalloc(&dIn, inputBytes));
CHECK_CUDA(cudaMemcpyAsync(dIn, inputDataHost, inputBytes, cudaMemcpyHostToDevice, stream));
const int64_t outElems = volume(outShape);
const size_t outBytes = static_cast<size_t>(outElems) * trtTypeSize(outType);
void* dOut{nullptr}; CHECK_CUDA(cudaMalloc(&dOut, outBytes));
std::vector<uint8_t> hOut(outBytes);
// Bind tensor addresses and run inference
if (!context->setTensorAddress(io.inName.c_str(), dIn)) throw std::runtime_error("setTensorAddress(in) failed");
if (!context->setTensorAddress(io.outName.c_str(), dOut)) throw std::runtime_error("setTensorAddress(out) failed");
if (!context->enqueueV3(stream)) throw std::runtime_error("enqueueV3 failed");
CHECK_CUDA(cudaMemcpyAsync(hOut.data(), dOut, outBytes, cudaMemcpyDeviceToHost, stream));
CHECK_CUDA(cudaStreamSynchronize(stream));
cudaFree(dIn); cudaFree(dOut); cudaStreamDestroy(stream);
RunResult rr;
rr.outHost = std::move(hOut);
rr.outShape = outShape;
rr.outType = outType;
return rr;
}
// =====================================================
// ----------------- load tensorrt-llm engine --------------
static constexpr llm::DataType kPromptDtype = llm::DataType::kFP16;
// fp32 -> fp16 (bit-level conversion; truncates the mantissa, handles subnormals, overflow and NaN)
inline std::vector<uint16_t> fp32_to_fp16(const std::vector<float>& x) {
    std::vector<uint16_t> h(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        uint32_t u; std::memcpy(&u, &x[i], 4);
        const uint32_t s = (u >> 31) & 1;
        const int fe = (u >> 23) & 0xFF;      // fp32 exponent field
        const int e  = fe - 127 + 15;         // exponent rebased to fp16
        uint32_t m = (u >> 13) & 0x3FF;       // top 10 mantissa bits (truncated, no rounding)
        uint16_t out;
        if (e <= 0) {                         // underflow: signed zero or fp16 subnormal
            if (e < -10) out = (uint16_t)(s << 15);
            else { m = (m | 0x400) >> (1 - e); out = (uint16_t)((s << 15) | m); }
        } else if (fe == 0xFF) {              // inf / NaN stay inf / NaN
            out = (uint16_t)((s << 15) | (0x1F << 10));
            if (u & 0x7FFFFF) out |= 1;
        } else if (e >= 31) {                 // finite overflow saturates to +/-inf
            out = (uint16_t)((s << 15) | (0x1F << 10));
        } else {
            out = (uint16_t)((s << 15) | (e << 10) | (m & 0x3FF));
        }
        h[i] = out;
    }
    return h;
}
// // fp16 -> fp32
// inline float fp16_to_fp32_cuda(uint16_t bits){
// __half h;
//     std::memcpy(&h, &bits, sizeof(uint16_t)); // copy the bit pattern into __half
//     return __half2float(h);                   // convert to float32 (usable on the host)
// }
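// Alternative sketch for fp32 -> fp16 (assumption: the installed CUDA toolkit exposes the
// host-side conversion in <cuda_fp16.h>); __float2half() rounds to nearest-even instead of truncating:
// inline uint16_t fp32_to_fp16_bits_cuda(float v) {
//     __half h = __float2half(v);
//     uint16_t bits; std::memcpy(&bits, &h, sizeof(bits));
//     return bits;
// }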
// Copy data into an llm::Tensor
template <typename T>
llm::Tensor makeCpuTensor2D(const std::vector<T>& flat, size_t M, size_t H) {
llm::Tensor t = llm::Tensor::cpu(
std::is_same<T,uint16_t>::value ? llm::DataType::kFP16 : llm::DataType::kFP32,
llm::Shape{ static_cast<int64_t>(M), static_cast<int64_t>(H) }
);
std::memcpy(t.getData(), flat.data(), flat.size()*sizeof(T));
return t;
}
// Build expanded_ids: replace the single <image> token at position pos with tokens [eng_vocab, eng_vocab+M)
static std::vector<int32_t> buildExpandedIds(const std::vector<int32_t>& ids, int pos, int eng_vocab, int M) {
std::vector<int32_t> out;
out.reserve(ids.size() - 1 + M);
out.insert(out.end(), ids.begin(), ids.begin()+pos); // left
for (int i=0;i<M;++i) out.push_back(eng_vocab + i); // fake ids
out.insert(out.end(), ids.begin()+pos+1, ids.end()); // right
return out;
}
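// Example with hypothetical ids: ids = {A, B, C, <image>, D}, pos = 3, eng_vocab = 152000, M = 2
//   -> expanded = {A, B, C, 152000, 152001, D}
// TensorRT-LLM treats token ids >= vocab_size as prompt-tuning ids: fake id k selects
// row (k - eng_vocab) of the prompt table, i.e. one projected image feature vector.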
// =====================================================
// load tokenizer
static std::string LoadBytesFromFile(const std::string& path) {
std::ifstream ifs(path, std::ios::binary);
if (!ifs) { throw std::runtime_error("open failed: " + path); }
return std::string{std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>()};
}
std::string build_chat_prompt(const std::string& user_message = "What are these?")
{
std::string prompt;
prompt += "<|im_start|>";
prompt += "user\n";
prompt += "<image>\n";
prompt += user_message;
prompt += "<|im_end|>\n<|im_start|>";
prompt += "assistant\n";
return prompt;
}
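// With the default message the prompt is:
//   "<|im_start|>user\n<image>\nWhat are these?<|im_end|>\n<|im_start|>assistant\n"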
// =====================================================
// -------------------- main ----------------------
int main(int argc, char** argv) {
// load img and preprocess
cv::Mat bgr = cv::imread("/media/wangyl/OS/wyl/program/VLMInference/vlm_cxx_inference/source/000000039769.jpg");
if (bgr.empty()) { std::cerr << "read image failed\n"; return 1; }
auto preprocess_img = preprocess_siglip_chw_f32(bgr);
// tensorrt inference parameters
Logger logger;
std::string V_ENGINE = "/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/vision_siglip.engine";
std::string P_ENGINE = "/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/projector.engine";
const int Batch_size = 1, Channel = 3, H_img = 384, W_img = 384;
// 1) vision infer
nvinfer1::Dims inDimsVision{4, {Batch_size, Channel, H_img, W_img}};
RunResult vis = runEngineV3(
V_ENGINE,
EngineIO{ "pixel_values", "feats" },/* io */
inDimsVision,
preprocess_img.data(), /* input host ptr */
preprocess_img.size() * sizeof(float), /* bytes */
logger
);
std::cout << "[Vision] out dtype=" << static_cast<int>(vis.outType) << " shape=(";
for (int i=0;i<vis.outShape.nbDims;++i)
std::cout << vis.outShape.d[i] << (i+1<vis.outShape.nbDims? ",":"");
std::cout << ")\n";
// 2) projector infer
const int64_t featsElems = volume(vis.outShape);
const size_t featsBytes = static_cast<size_t>(featsElems) * trtTypeSize(vis.outType);
nvinfer1::Dims inDimsProj = vis.outShape;
RunResult proj = runEngineV3(
P_ENGINE,
EngineIO{ "feats_in", "feats_out" },/* io */
inDimsProj,
vis.outHost.data(),/* input host ptr */
featsBytes,/* bytes */
logger
);
std::cout << "[Projector] out dtype=" << static_cast<int>(proj.outType) << " shape=(";
for (int i=0;i<proj.outShape.nbDims;++i)
std::cout << proj.outShape.d[i] << (i+1<proj.outShape.nbDims? ",":"");
std::cout << ")\n";
// 3) vision feature for llm
assert(proj.outType == nvinfer1::DataType::kFLOAT);
const int nb = proj.outShape.nbDims;
int64_t hiddenDim = proj.outShape.d[nb - 1];
int64_t M_rows = 1;
for (int i = 0; i < nb - 1; ++i) M_rows *= proj.outShape.d[i];
const int64_t total = M_rows * hiddenDim;
std::vector<float> mm_features_fp32(static_cast<size_t>(total));
std::memcpy(mm_features_fp32.data(), proj.outHost.data(), static_cast<size_t>(total) * sizeof(float));
// std::cout << std::fixed << std::setprecision(6);
// std::cout << "mm_features: " << M_rows << " x " << hiddenDim << "\n"
// << "first row (8 vals): ";
// for (int j = 0; j < std::min<int64_t>(8, hiddenDim); ++j) {
// std::cout << mm_features_fp32[j] << " ";
// }
// std::cout << "\n";
// 4) tensorrt-llm infer
initTrtLlmPlugins();
std::string engineDir = "/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/llm_qwen2_0.5b/llama_trt_engine";
//tokenizer
const int EOS_ID = 151645; // "<|im_end|>"
const int PAD_ID = 151643; // "<|endoftext|>"
const int IMAGE_TOKEN_ID= 151646; // "<image>"
int eng_vocab = 152000; // vocab_size
std::string blob = LoadBytesFromFile("/media/wangyl/OS/wyl/program/VLMInference/export_llava_qwen05B/tokenizer/tokenizer.json");
auto tok = tokenizers::Tokenizer::FromBlobJSON(blob);
std::string prompt = build_chat_prompt("What are these?");
std::vector<int32_t> ids = tok->Encode(prompt);
// Locate the <image> token; its index pos is where the image features are inserted
auto itImg = std::find(ids.begin(), ids.end(), IMAGE_TOKEN_ID);
if (itImg == ids.end())
{
std::cerr << "[FATAL] prompt ids does not contain <image> (id=" << IMAGE_TOKEN_ID << ")\n";
return 1;
}
int pos = static_cast<int>(std::distance(ids.begin(), itImg)); //pos=3
// Build expanded ids: insert [eng_vocab, eng_vocab+M_rows) at position pos
std::vector<int32_t> expanded = buildExpandedIds(ids, pos, eng_vocab, static_cast<int>(M_rows));
// Extra token ids for KV cache reuse (unique ids for the image token positions)
std::vector<long unsigned int> inputTokenExtraIds(expanded.size(), static_cast<long unsigned int>(-1));
for (int i = 0; i < static_cast<int>(M_rows); ++i) {
inputTokenExtraIds[pos + i] = i;
}
// Build the prompt table ([M_rows, hiddenDim])
llm::Tensor promptTable;
if (kPromptDtype == llm::DataType::kFP16){
auto half_bits = fp32_to_fp16(mm_features_fp32);
promptTable = makeCpuTensor2D<uint16_t>(
half_bits, static_cast<size_t>(M_rows), static_cast<size_t>(hiddenDim));
}
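// Note: the prompt table must provide one row per fake token id (M_rows rows),
// and hiddenDim is assumed to match the hidden size of the LLM engine.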
// Sampling parameters
llm::SamplingConfig sampling;
sampling.setTemperature(0.0f);
sampling.setTopP(1.0f);
sampling.setSeed(0);
sampling.setTopK(0); // explicitly disable top-k
// infer
llm::PromptTuningConfig pConfig(promptTable, std::make_optional(inputTokenExtraIds));
llm::VecTokens batchInput;
batchInput.assign(expanded.begin(), expanded.end());
llm::SizeType32 maxNewTokens = 200;
llm::Request req(
batchInput,
maxNewTokens,
/*streaming=*/false,
sampling,
llm::OutputConfig{}, // default output config
/*endId=*/EOS_ID,
/*padId=*/PAD_ID,
/*positionIds=*/std::nullopt,
/*badWords=*/std::nullopt,
/*stopWords=*/std::nullopt,
/*embeddingBias=*/std::nullopt,
/*externalDraftTokens=*/std::nullopt,
/*pTuningConfig=*/pConfig, // key step: attach the image prompt table
/*multimodalInput=*/std::nullopt,
/*multimodalEmbedding=*/std::nullopt
);
llm::SizeType32 beamWidth = 1;
llm::ExecutorConfig execCfg(beamWidth);
execCfg.setGpuWeightsPercent(1.0f);
llm::Executor executor(std::filesystem::path(engineDir),
llm::ModelType::kDECODER_ONLY,
execCfg);
auto rid = executor.enqueueRequest(req);
auto responses = executor.awaitResponses(rid);
// decode
std::vector<int32_t> out_ids;
if (!responses.empty())
{
const auto& res = responses.at(0).getResult();
if (!res.outputTokenIds.empty())
{
// take beam 0
const auto& beams = res.outputTokenIds;
out_ids.assign(beams[0].begin(), beams[0].end());
}
}
std::string text = tok->Decode(out_ids);
std::cout << "decoded: " << text << "\n";
return 0;
}
CMakeLists.txt:
cmake_minimum_required(VERSION 3.27)
set(TRTLLM_DIR "/home/wangyl/docker_program/trtllm/TensorRT-LLM")
list(APPEND CMAKE_MODULE_PATH "${TRTLLM_DIR}/cpp/cmake/modules")
if(NOT TRTLLM_BUILD_DIR)
set(TRTLLM_BUILD_DIR "${TRTLLM_DIR}/cpp/build")
endif()
set(TRTLLM_LIB_PATH "${TRTLLM_BUILD_DIR}/tensorrt_llm/libtensorrt_llm.so")
set(TRTLLM_PLUGIN_PATH
"${TRTLLM_BUILD_DIR}/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so"
)
set(TRTLLM_INCLUDE_DIR "${TRTLLM_DIR}/cpp/include")
# Determine CXX11 ABI compatibility
execute_process(
COMMAND bash -c "nm -f posix -D ${TRTLLM_LIB_PATH} | grep __cxx11"
RESULT_VARIABLE GLIB_CXX11_FOUND
OUTPUT_QUIET)
if(GLIB_CXX11_FOUND EQUAL 0)
set(USE_CXX11_ABI 1)
else()
set(USE_CXX11_ABI 0)
endif()
message(STATUS "Use CXX11 ABI: ${USE_CXX11_ABI}")
add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
set(CMAKE_VERBOSE_MAKEFILE 1)
# Define project name
project(llava_vlm_Inference)
# Compile options
set(CMAKE_CXX_FLAGS "-Wall -pthread -lstdc++ -DENABLE_MULTI_DEVICE=1")
# set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0")
set(CMAKE_BUILD_TYPE Debug)
# find_package(CUDA REQUIRED)
find_package(OpenCV REQUIRED)
find_package(CUDAToolkit REQUIRED COMPONENTS cuda_driver cudart_static nvml)
message(STATUS "CUDA library status:")
message(STATUS " version: ${CUDAToolkit_VERSION}")
message(STATUS " libraries: ${CUDAToolkit_LIBRARY_DIR}")
message(STATUS " include path: ${CUDAToolkit_INCLUDE_DIRS}")
# TRT dependencies
find_package(TensorRT 10 REQUIRED)
if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11")
add_definitions("-DENABLE_BF16")
message(
STATUS
"CUDA_VERSION ${CUDA_VERSION} is greater or equal than 11.0, enable -DENABLE_BF16 flag"
)
endif()
if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.8")
add_definitions("-DENABLE_FP8")
message(
STATUS
"CUDA_VERSION ${CUDA_VERSION} is greater or equal than 11.8, enable -DENABLE_FP8 flag"
)
endif()
set(TOKCPP_ROOT /home/wangyl/docker_program/trtllm/tokenizers-cpp)
set(TOKCPP_BUILD ${TOKCPP_ROOT}/build)
# tensorrt_llm shared lib
add_library(tensorrt_llm SHARED IMPORTED)
set_property(TARGET tensorrt_llm PROPERTY IMPORTED_LOCATION ${TRTLLM_LIB_PATH})
set_property(
TARGET tensorrt_llm PROPERTY IMPORTED_LINK_INTERFACE_LIBRARIES
CUDA::cuda_driver CUDA::cudart_static CUDA::nvml)
# nvinfer_plugin_tensorrt_llm shared lib
add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
set_property(TARGET nvinfer_plugin_tensorrt_llm PROPERTY IMPORTED_LOCATION
${TRTLLM_PLUGIN_PATH})
set_property(TARGET nvinfer_plugin_tensorrt_llm
PROPERTY IMPORTED_LINK_INTERFACE_LIBRARIES tensorrt_llm)
include_directories(${TRTLLM_INCLUDE_DIR} ${CUDAToolkit_INCLUDE_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})
add_executable(llava_qwen_05B_infer ./src/llava_qwen_05B_infer.cpp)
target_include_directories(llava_qwen_05B_infer PRIVATE ${TOKCPP_ROOT}/include)
target_link_directories(llava_qwen_05B_infer PRIVATE ${TOKCPP_BUILD})
target_link_libraries(llava_qwen_05B_infer
nvinfer
CUDA::cudart
opencv_core
opencv_imgproc
opencv_highgui
nvinfer_plugin_tensorrt_llm
tokenizers_cpp
tokenizers_c
)
With the same image and prompt, the output matches the Python result:
These are two cats, one on the left and one on the right. They are lying on a pink blanket, which is placed on a couch. The cat on the left is sleeping, while the one on the right is resting.
Another example, a description of an autonomous-driving simulation scene:
These are images of a virtual environment, likely from a video game or a simulation. The scene depicts a car driving down a road with a cloudy sky, greenery on the sides, and a street lamp in the background. The environment is designed to simulate a realistic driving experience, with realistic textures and lighting effects that give the impression of a real-world setting.