Yolov8-obb使用cuda前后推理
Export-Onnx · Convert-Engine · yolo_cuda_utils.h · yolo_obb.h · process.h · yolo_obb.cpp · process.cu · main.cpp
·
Export-Onnx
# Export a YOLOv8-OBB model to ONNX with ultralytics.
from ultralytics import YOLO
# NOTE(review): this model built from the yaml is immediately overwritten by
# the pretrained checkpoint on the next line, so the first load is redundant.
model = YOLO("yolov8n-obb.yaml")
model = YOLO("yolov8n-obb.pt")
# Short fine-tune / sanity run on the DOTA8 sample dataset at 1024x1024.
trainer = model.train(data="dota8.yaml", epochs=3, imgsz=1024)
metrics = model.val(data="dota8.yaml")
results = model("boats.jpg")
# Produces yolov8n-obb.onnx, then converted to an engine with trtexec below.
model.export(format="onnx")
Convert-Engine
trtexec.exe --onnx=yolov8n-obb.onnx --saveEngine=yolov8n-obb.engine --fp16
logger.h
// logger.h: declarations for the TensorRT sample logging helpers.
// Definitions live in logger.cpp; logging.h comes from the TensorRT samples.
#include "logging.h"
class SampleErrorRecorder;
extern SampleErrorRecorder gRecorder;
namespace sample
{
// Global logger handed to the TensorRT runtime/builder.
extern Logger gLogger;
// Per-severity stream consumers, used like std::ostream.
extern LogStreamConsumer gLogVerbose;
extern LogStreamConsumer gLogInfo;
extern LogStreamConsumer gLogWarning;
extern LogStreamConsumer gLogError;
extern LogStreamConsumer gLogFatal;
// Sets the minimum severity that is actually reported, on the logger and all consumers.
void setReportableSeverity(Logger::Severity severity);
}
yolo_cuda_utils.h
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include <fstream>
#include "logger.h"
#include "ErrorRecorder.h"
#include "NvInfer.h"
// Checks the cudaError_t returned by a CUDA runtime call and aborts on failure.
// Wrapped in do { } while (0) so it expands to a single statement and is safe
// after an unbraced `if`; also reports the human-readable error string (via
// cudaGetErrorString) instead of only the raw enum value, and flushes with endl.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    do {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << cudaGetErrorString(error_code) << " ("\
                      << error_code << ") at " << __FILE__ << ":" << __LINE__ << std::endl;\
            assert(0);\
        }\
    } while (0)
#endif
#pragma once // NOTE(review): conventionally this belongs on the first line of the header, before the includes above
// Letterboxes an image to (input_w x input_h): scales it preserving the aspect
// ratio, then centers it on a gray (128,128,128) canvas.
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    float scale_w = input_w / (img.cols * 1.0);
    float scale_h = input_h / (img.rows * 1.0);
    int new_w, new_h, off_x, off_y;
    if (scale_h > scale_w) {
        // Width is the limiting dimension: pad top/bottom.
        new_w = input_w;
        new_h = scale_w * img.rows;
        off_x = 0;
        off_y = (input_h - new_h) / 2;
    } else {
        // Height is the limiting dimension: pad left/right.
        new_w = scale_h * img.cols;
        new_h = input_h;
        off_x = (input_w - new_w) / 2;
        off_y = 0;
    }
    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);
    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    resized.copyTo(canvas(cv::Rect(off_x, off_y, resized.cols, resized.rows)));
    return canvas;
}
// Appends the entry names of directory `p_dir_name` to `file_names`, skipping
// "." and "..". Returns 0 on success, -1 if the directory cannot be opened.
// Names arrive in readdir() order (unsorted).
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir_handle = opendir(p_dir_name);
    if (dir_handle == nullptr) {
        return -1;
    }
    for (struct dirent* entry = readdir(dir_handle); entry != nullptr; entry = readdir(dir_handle)) {
        std::string name(entry->d_name);
        if (name != "." && name != "..") {
            file_names.push_back(name);
        }
    }
    closedir(dir_handle);
    return 0;
}
// Trims leading and trailing whitespace (space, tab, CR, LF) from `str`.
// NOTE: despite the historical name, it trims BOTH ends (as the original did).
// Fixes: the original only stripped ' ', so label files with Windows CRLF line
// endings kept a trailing '\r' after std::getline, and an all-whitespace line
// was returned unchanged instead of empty.
static inline std::string trim_leading_whitespace(const std::string& str) {
    const char* whitespace = " \t\r\n";
    size_t first = str.find_first_not_of(whitespace);
    if (std::string::npos == first) {
        return std::string();  // whitespace-only (or empty) input
    }
    size_t last = str.find_last_not_of(whitespace);
    return str.substr(first, (last - first + 1));
}
// Formats `a_value` in fixed-point notation with `n` digits after the decimal
// point (default 2), e.g. 3.14159f -> "3.14". Used for on-image confidence labels.
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream formatted;
    formatted.precision(n);
    formatted << std::fixed << a_value;
    return formatted.str();
}
// Reads one class label per line from `labels_filename` into `labels_map`,
// keyed by 0-based line index. Each line is whitespace-trimmed.
// Returns 0 on success; -1 if the file cannot be opened (the original always
// returned 0, silently producing an empty map for a missing file).
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        std::cerr << "failed to open label file: " << labels_filename << std::endl;
        return -1;
    }
    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }
    // std::ifstream closes itself on destruction; no explicit close() needed.
    return 0;
}
// Build-time configuration constants (tensorrtx-style config).
#define USE_FP16
// Tensor names as exported in the ONNX graph.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output0";
const static int kNumClass = 80;
const static int kBatchSize = 1;
const static int kGpuId = 0;
// Network input resolution; this OBB model is exported at 1024x1024.
const static int kInputH = 1024;
const static int kInputW = 1024;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;
// Upper bound on source image pixels; sizes the pinned/device staging buffers.
const static int kMaxInputImageSize = 3000 * 3000;
// Maximum detections kept by the GPU decode stage.
const static int kMaxNumOutputBbox = 1000;
// Calibration image folder for INT8 quantization (unused with USE_FP16).
const static char* kInputQuantizationFolder = "./coco_calib";
// Classification / pose constants kept for header compatibility; the OBB path does not use them.
constexpr static int kClsNumClass = 1000;
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;
constexpr static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;
// Number of classes in the DOTA OBB model.
constexpr static int kObbNumClass = 15;
// One decoded detection. The total layout (90 floats) must match the per-box
// float stride used by the decode kernels: sizeof(Detection) / sizeof(float).
struct alignas(float) Detection {
float bbox[4]; // OBB decode stores cx, cy, w, h; axis-aligned decode stores l, t, r, b
float conf;
float class_id;
float mask[32]; // segmentation coefficients (unused by the OBB path)
float keypoints[kNumberOfPoints * 3]; // x, y, visibility per point (pose only)
float angle; // OBB rotation (radians; last float, offset 89)
};
// 2x3 affine transform passed by value to the letterbox warp kernel.
struct AffineMatrix {
float value[6];
};
// Floats stored per candidate box in the compact GPU decode buffer:
// cx, cy, w, h, confidence, class id, keep flag, angle  ->  8 values.
// Fix: the original value (sizeof(AffineMatrix)/sizeof(float) + 1 == 7) was one
// float short for OBB: decode_kernel_obb writes 8 floats per slot and
// nms_kernel_obb / the host decoder read the angle at offset 7, so with a
// stride of 7 each box's angle overlapped the next box's cx.
const int bbox_element =
sizeof(AffineMatrix) / sizeof(float) + 2;
yolo_obb.h
#pragma once
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "yolo_cuda_utils.h"
#include <fstream>
#include <iostream>
#include "process.h"
#include "logging.h"
// Maps a bbox in letterboxed network coordinates back to an image-space rect.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);
// Converts the host copy of the GPU decode buffer into per-image Detection lists.
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
int bbox_element, const std::vector<cv::Mat>& img_batch);
// GPU decode/NMS entry points (implemented in process.cu; also declared in process.h).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
cudaStream_t stream);
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
// Drawing helpers for the different task heads (detect / OBB / pose / segment).
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
std::unordered_map<int, std::string>& labels_map);
// End-to-end demo: load engine, preprocess, infer, decode, draw, save images.
void infer_image_demo();
process.h
#pragma once
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "yolo_cuda_utils.h"
#include <map>
// Allocates the pinned-host/device staging buffers (sized max_image_size * 3 bytes).
void cuda_preprocess_init(int max_image_size);
// Frees the staging buffers allocated by cuda_preprocess_init.
void cuda_preprocess_destroy();
// Letterboxes one BGR8 image into the planar float input tensor on `stream`.
void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, cudaStream_t stream);
// Sequentially preprocesses a batch of images into `dst` (synchronizes per image).
void cuda_batch_preprocess(std::vector<cv::Mat> &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream);
// GPU decode/NMS for the axis-aligned and OBB heads (implemented in process.cu).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
cudaStream_t stream);
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
logger.cpp
#include "logger.h"
#include "ErrorRecorder.h"
#include "logging.h"
using namespace nvinfer1;
// Global TensorRT error recorder instance declared in logger.h.
SampleErrorRecorder gRecorder;
namespace sample
{
// Definitions of the logging globals declared in logger.h.
Logger gLogger{ Logger::Severity::kINFO };
LogStreamConsumer gLogVerbose{ LOG_VERBOSE(gLogger) };
LogStreamConsumer gLogInfo{ LOG_INFO(gLogger) };
LogStreamConsumer gLogWarning{ LOG_WARN(gLogger) };
LogStreamConsumer gLogError{ LOG_ERROR(gLogger) };
LogStreamConsumer gLogFatal{ LOG_FATAL(gLogger) };
// Applies `severity` as the minimum reported level to the logger and every consumer.
void setReportableSeverity(Logger::Severity severity)
{
gLogger.setReportableSeverity(severity);
gLogVerbose.setReportableSeverity(severity);
gLogInfo.setReportableSeverity(severity);
gLogWarning.setReportableSeverity(severity);
gLogError.setReportableSeverity(severity);
gLogFatal.setReportableSeverity(severity);
}
}
yolo_obb.cpp
#include <vector>

#include "yolo_obb.h"
#include "yolo_cuda_utils.h"
#include "ErrorRecorder.h"
#include "logging.h"
using namespace nvinfer1;
// NOTE(review): this is a global-namespace gLogger, a separate object from the
// sample::gLogger defined in logger.cpp; deserialize_engine below uses this one.
sample::Logger gLogger;
// Maps a box given as (l, t, r, b) in letterboxed network coordinates back to
// a clamped cv::Rect in original-image coordinates, undoing the scale and the
// centering pad applied by preprocessing.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    float l, r, t, b;
    if (r_h > r_w) {
        // Image was width-bound: vertical padding was added top/bottom.
        float pad_y = (kInputH - r_w * img.rows) / 2;
        l = bbox[0] / r_w;
        r = bbox[2] / r_w;
        t = (bbox[1] - pad_y) / r_w;
        b = (bbox[3] - pad_y) / r_w;
    } else {
        // Image was height-bound: horizontal padding was added left/right.
        float pad_x = (kInputW - r_h * img.cols) / 2;
        l = (bbox[0] - pad_x) / r_h;
        r = (bbox[2] - pad_x) / r_h;
        t = bbox[1] / r_h;
        b = bbox[3] / r_h;
    }
    // Clamp into the image and convert to an integer rect.
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
    return cv::Rect(int(round(l)), int(round(t)), width, height);
}
// Like get_rect, but additionally maps the pose keypoints (x, y, visibility
// triplets in `lmk`, modified IN PLACE) from network to image coordinates.
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
float l, r, t, b;
float r_w = kInputW / (img.cols * 1.0);
float r_h = kInputH / (img.rows * 1.0);
if (r_h > r_w) {
// Width-bound letterbox: remove the vertical pad, then unscale.
l = bbox[0] / r_w;
r = bbox[2] / r_w;
t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w;
b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w;
for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
// lmk[i + 2] is the visibility score and needs no remapping.
lmk[i] /= r_w;
lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w;
}
} else {
// Height-bound letterbox: remove the horizontal pad, then unscale.
l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h;
r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h;
t = bbox[1] / r_h;
b = bbox[3] / r_h;
for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h;
lmk[i + 1] /= r_h;
}
}
// Clamp into the image and convert to an integer rect.
l = std::max(0.0f, l);
t = std::max(0.0f, t);
int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
return cv::Rect(int(round(l)), int(round(t)), width, height);
}
// Draws axis-aligned detection rectangles and numeric class-id labels on each
// image of the batch (images are modified in place).
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    for (size_t i = 0; i < img_batch.size(); i++) {
        cv::Mat img = img_batch[i];
        for (auto& det : res_batch[i]) {
            cv::Rect box = get_rect(img, det.bbox);
            cv::rectangle(img, box, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(img, std::to_string((int)det.class_id), cv::Point(box.x, box.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
    }
}
// Draws pose results: the bounding box, each keypoint with visibility > 0.5,
// and the skeleton lines between keypoint pairs. Keypoints are remapped to
// image coordinates in place by get_rect_adapt_landmark.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
// Keypoint index pairs joined by a line (17-point skeleton — presumably the
// COCO layout; confirm against the pose model's keypoint order).
const std::vector<std::pair<int, int>> skeleton_pairs = {
{0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11},
{6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};
for (size_t i = 0; i < img_batch.size(); i++) {
auto& res = res_batch[i];
cv::Mat img = img_batch[i];
for (size_t j = 0; j < res.size(); j++) {
cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints);
cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2,
cv::Scalar(0xFF, 0xFF, 0xFF), 2);
// Draw keypoints whose visibility score exceeds 0.5.
for (int k = 0; k < kNumberOfPoints * 3; k += 3) {
if (res[j].keypoints[k + 2] > 0.5) {
cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3,
cv::Scalar(0, 0x27, 0xC1), -1);
}
}
// Connect a bone only when both endpoints are confidently visible.
for (const auto& bone : skeleton_pairs) {
int kp1_idx = bone.first * 3;
int kp2_idx = bone.second * 3;
if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) {
cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]);
cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]);
cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2);
}
}
}
}
}
// Crops the letterboxed (non-padded) region out of a network-sized mask and
// resizes it to the original image resolution.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    int x, y, w, h;
    if (r_h > r_w) {
        // Width-bound: the valid region is vertically centered.
        w = kInputW;
        h = r_w * img.rows;
        x = 0;
        y = (kInputH - h) / 2;
    } else {
        // Height-bound: the valid region is horizontally centered.
        w = r_h * img.cols;
        h = kInputH;
        x = (kInputW - w) / 2;
        y = 0;
    }
    cv::Mat scaled;
    cv::resize(mask(cv::Rect(x, y, w, h)), scaled, img.size());
    return scaled;
}
// Blends each detection's segmentation mask (50/50 with a per-class color)
// into `img` inside the detection rect, then draws the box and a filled label
// with the class name and confidence. `masks[i]` must correspond to `dets[i]`.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
std::unordered_map<int, std::string>& labels_map) {
// Fixed palette, indexed by class id modulo its size.
static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
for (size_t i = 0; i < dets.size(); i++) {
cv::Mat img_mask = scale_mask(masks[i], img);
auto color = colors[(int)dets[i].class_id % colors.size()];
// Palette entries are 0xRRGGBB; unpack into OpenCV's BGR order.
auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
cv::Rect r = get_rect(img, dets[i].bbox);
// Blend mask pixels above the 0.5 threshold, restricted to the box.
for (int x = r.x; x < r.x + r.width; x++) {
for (int y = r.y; y < r.y + r.height; y++) {
float val = img_mask.at<float>(y, x);
if (val <= 0.5)
continue;
img.at<cv::Vec3b>(y, x)[0] = img.at<cv::Vec3b>(y, x)[0] / 2 + bgr[0] / 2;
img.at<cv::Vec3b>(y, x)[1] = img.at<cv::Vec3b>(y, x)[1] / 2 + bgr[1] / 2;
img.at<cv::Vec3b>(y, x)[2] = img.at<cv::Vec3b>(y, x)[2] / 2 + bgr[2] / 2;
}
}
cv::rectangle(img, r, bgr, 2);
// Filled background behind the "<label> <conf>" text for readability.
cv::Size textSize =
cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
cv::Point topLeft(r.x, r.y - textSize.height);
cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
int lineThickness = 2;
cv::rectangle(img, topLeft, bottomRight, bgr, -1);
cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
}
}
// Copies boxes flagged as kept (keep flag at offset 6) from the host-side
// decode buffer into `res`. Per-box layout: cx, cy, w, h, conf, class id,
// keep flag, angle; element 0 of the buffer is the box count.
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
cv::Mat& img, int count) {
    for (int i = 0; i < count; i++) {
        const float* box = decode_ptr_host + 1 + i * bbox_element;
        if ((int)box[6] != 1)
            continue;  // suppressed by NMS (or never written)
        Detection det;
        det.bbox[0] = box[0];
        det.bbox[1] = box[1];
        det.bbox[2] = box[2];
        det.bbox[3] = box[3];
        det.conf = box[4];
        det.class_id = box[5];
        det.angle = box[7];
        res.push_back(det);
    }
}
// Splits the host decode buffer into per-image Detection lists.
// Fixes: the original indexed image i's data at &decode_ptr_host[i * count],
// but each image occupies a fixed stride of (1 + kMaxNumOutputBbox *
// bbox_element) floats, and it read the box count only from image 0. Harmless
// at kBatchSize == 1 (the only supported case), wrong for any larger batch.
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    const int image_stride = 1 + kMaxNumOutputBbox * bbox_element;
    for (int i = 0; i < batch_size; i++) {
        const float* image_ptr = decode_ptr_host + i * image_stride;
        // Element 0 is the per-image box count; clamp to the buffer capacity.
        int count = std::min(static_cast<int>(image_ptr[0]), kMaxNumOutputBbox);
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host_obb(res_batch[i], image_ptr, bbox_element, img, count);
    }
}
// Converts an OBB detection (bbox = cx, cy, w, h in network coordinates plus
// `angle` in radians) into the four clamped corner points of the rotated
// rectangle in original-image coordinates.
// Fixes: removed the leftover per-box debug std::cout prints and the dead
// `normal_angle` computation (it was only ever printed, never used).
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    float x1 = box.bbox[0];
    float y1 = box.bbox[1];
    float w = box.bbox[2];
    float h = box.bbox[3];
    float angle = box.angle * 180.0f / CV_PI;
    // Canonicalize so the long side is `w` and the angle lies in [0, 180).
    if (h >= w) {
        std::swap(w, h);
        angle = fmod(angle + 90.0f, 180.0f);
    }
    if (angle < 0) {
        angle += 360.0f;
    }
    if (angle > 180.0f) {
        angle -= 180.0f;
    }
    float cos_value = std::cos(angle * CV_PI / 180.0f);
    float sin_value = std::sin(angle * CV_PI / 180.0f);
    // Map the box's axis-aligned extent back to image space via get_rect to
    // recover the letterbox-corrected center and side lengths.
    float l = x1 - w / 2;
    float r = x1 + w / 2;
    float t = y1 - h / 2;
    float b = y1 + h / 2;
    float bbox[4] = {l, t, r, b};
    cv::Rect rect = get_rect(img, bbox);
    float x_ = (rect.x + rect.x + rect.width) / 2;
    float y_ = (rect.y + rect.y + rect.height) / 2;
    float width = rect.width;
    float height = rect.height;
    // Half-edge vectors along the rotated axes.
    float vec1x = width / 2 * cos_value;
    float vec1y = width / 2 * sin_value;
    float vec2x = -height / 2 * sin_value;
    float vec2y = height / 2 * cos_value;
    std::vector<cv::Point> corner_points(4);
    corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y)));
    corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y)));
    corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y)));
    corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y)));
    // Clamp corners into the image so drawing never goes out of bounds.
    for (auto& point : corner_points) {
        point.x = std::max(0, std::min(point.x, img.cols - 1));
        point.y = std::max(0, std::min(point.y, img.rows - 1));
    }
    return corner_points;
}
// Draws rotated (OBB) detections: the rotated rectangle outline plus a small
// filled "<class>:<conf>" label anchored at the first corner point.
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
// Fixed 0xRRGGBB palette, indexed by class id modulo its size.
static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
for (size_t i = 0; i < img_batch.size(); i++) {
auto& res = res_batch[i];
auto& img = img_batch[i];
for (auto& obj : res) {
auto color = colors[(int)obj.class_id % colors.size()];
auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
auto corner_points = get_corner(img, obj);
cv::polylines(img, std::vector<std::vector<cv::Point>>{corner_points}, true, bgr, 1);
auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf));
cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr);
int width = textsize.width;
int height = textsize.height;
// Put the label above the corner when there is room, otherwise below it.
bool outside = (corner_points[0].y - height >= 3) ? true : false;
cv::Point p1(corner_points[0].x, corner_points[0].y), p2;
p2.x = corner_points[0].x + width;
if (outside) {
p2.y = corner_points[0].y - height - 3;
} else {
p2.y = corner_points[0].y + height + 3;
}
// Filled background rectangle behind the label text.
cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA);
cv::putText(
img, text,
cv::Point(corner_points[0].x, (outside ? corner_points[0].y - 2 : corner_points[0].y + height + 2)),
0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
}
}
}
// Floats per image in the raw engine output buffer: one count slot plus
// kMaxNumOutputBbox candidates of sizeof(Detection)/sizeof(float) floats each.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Loads a serialized TensorRT engine from `engine_name` and creates the
// runtime, engine and execution context. Aborts via assert on any failure.
// Fixes: the raw new[]/delete[] buffer (leaked on every assert path) is now a
// std::vector, and the file.read result is checked instead of being trusted.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    // Determine the file size, then slurp the whole engine into memory.
    file.seekg(0, file.end);
    size_t size = file.tellg();
    file.seekg(0, file.beg);
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), size);
    if (!file) {
        std::cerr << "failed to read " << size << " bytes from " << engine_name << std::endl;
        assert(false);
    }
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
}
// Allocates the device input/output buffers and the host/device decode buffers.
// Asserts that the engine exposes exactly the two expected bindings.
// NOTE(review): *output_buffer_host is never allocated here — the demo reads
// results only via decode_ptr_host (delete[] of the null pointer later is safe).
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device)
{
// Legacy (pre-TensorRT-8.5) binding API: input at index 0, output at index 1.
assert(engine->getNbBindings() == 2);
const int inputIndex = engine->getBindingIndex(kInputTensorName);
const int outputIndex = engine->getBindingIndex(kOutputTensorName);
assert(inputIndex == 0);
assert(outputIndex == 1);
// NCHW float input tensor and the raw prediction output tensor.
CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
if (kBatchSize > 1) {
std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
exit(0);
}
// Compact decode buffer: [count][kMaxNumOutputBbox * bbox_element floats].
*decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
}
// Enqueues one inference plus GPU decode/NMS on `stream` and copies the
// compact result buffer back into `decode_ptr_host`.
// Fix: the end timestamp was taken BEFORE cudaStreamSynchronize, so the
// printed time only measured launch overhead of the asynchronous work; the
// stream is now synchronized first, making the reported time meaningful (and
// guaranteeing decode_ptr_host is ready when this function returns, as before).
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
float* decode_ptr_host, float* decode_ptr_device, int model_bboxes) {
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    // Clear the decode buffer (count slot and keep flags) before decoding.
    CUDA_CHECK(
        cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
    cuda_decode_obb((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
    cuda_nms_obb(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
    CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
                               sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
                               stream));
    // Wait for all queued work before reading decode_ptr_host or stopping the clock.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::system_clock::now();
    std::cout << "inference and gpu postprocess time: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}
// End-to-end demo: loads yolov8n-obb.engine, runs every image in ./images
// through preprocess -> inference -> GPU decode/NMS -> drawing, and writes
// each result next to the executable as "_<original name>".
void infer_image_demo() {
cudaSetDevice(kGpuId);
std::string engine_name = "yolov8n-obb.engine";
std::string img_dir = "images";
// The following locals are unused leftovers from the generic yolov8 demo.
std::string sub_type = "";
int model_bboxes;
int is_p = 0;
float gd = 0.0f, gw = 0.0f;
int max_channels = 0;
IRuntime* runtime = nullptr;
ICudaEngine* engine = nullptr;
IExecutionContext* context = nullptr;
deserialize_engine(engine_name, &runtime, &engine, &context);
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
// Allocate the pinned/device staging buffers used by cuda_preprocess.
cuda_preprocess_init(kMaxInputImageSize);
// Number of candidate boxes is the leading dimension of the output binding.
auto out_dims = engine->getBindingDimensions(1);
model_bboxes = out_dims.d[0];
float* device_buffers[2];
float* output_buffer_host = nullptr;
float* decode_ptr_host = nullptr;
float* decode_ptr_device = nullptr;
std::vector<std::string> file_names;
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
std::cerr << "read_files_in_dir failed." << std::endl;
}
prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
&decode_ptr_device);
// Process the directory in batches of kBatchSize (1 in this configuration).
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
std::vector<cv::Mat> img_batch;
std::vector<std::string> img_name_batch;
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
// NOTE(review): cv::imread failures (empty Mat) are not checked here.
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
img_batch.push_back(img);
img_name_batch.push_back(file_names[j]);
}
cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
decode_ptr_device, model_bboxes);
std::vector<std::vector<Detection>> res_batch;
batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
draw_bbox_obb(img_batch, res_batch);
for (size_t j = 0; j < img_batch.size(); j++) {
cv::imwrite("_" + img_name_batch[j], img_batch[j]);
}
}
// Teardown: stream first, then device/host buffers, then TRT objects.
cudaStreamDestroy(stream);
CUDA_CHECK(cudaFree(device_buffers[0]));
CUDA_CHECK(cudaFree(device_buffers[1]));
CUDA_CHECK(cudaFree(decode_ptr_device));
delete[] decode_ptr_host;
delete[] output_buffer_host;
cuda_preprocess_destroy();
delete context;
delete engine;
delete runtime;
}
process.cu
#include "yolo_cuda_utils.h"
#include "process.h"
// Staging buffers for preprocessing: pinned host memory and its device
// counterpart, both allocated in cuda_preprocess_init (max_image_size * 3 bytes).
static uint8_t* img_buffer_host = nullptr;
static uint8_t* img_buffer_device = nullptr;
// Letterbox warp: one thread per destination pixel (edge = dst_w * dst_h,
// launched 1-D). Applies the dst->src affine transform d2s, samples the BGR8
// source bilinearly (padding with const_value_st outside the source), swaps
// B<->R, normalizes to [0,1], and writes planar CHW floats into dst.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= edge)
return;
// Unpack the 2x3 affine matrix (row-major: [x1 y1 z1; x2 y2 z2]).
float m_x1 = d2s.value[0];
float m_y1 = d2s.value[1];
float m_z1 = d2s.value[2];
float m_x2 = d2s.value[3];
float m_y2 = d2s.value[4];
float m_z2 = d2s.value[5];
int dx = position % dst_width;
int dy = position / dst_width;
float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
float c0, c1, c2;
if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
// Fully outside the source: emit the padding color.
c0 = const_value_st;
c1 = const_value_st;
c2 = const_value_st;
} else {
// Bilinear interpolation over the 2x2 neighborhood; taps that fall
// outside the source read from const_value instead.
int y_low = floorf(src_y);
int x_low = floorf(src_x);
int y_high = y_low + 1;
int x_high = x_low + 1;
uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
float ly = src_y - y_low;
float lx = src_x - x_low;
float hy = 1 - ly;
float hx = 1 - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
uint8_t* v1 = const_value;
uint8_t* v2 = const_value;
uint8_t* v3 = const_value;
uint8_t* v4 = const_value;
if (y_low >= 0) {
if (x_low >= 0)
v1 = src + y_low * src_line_size + x_low * 3;
if (x_high < src_width)
v2 = src + y_low * src_line_size + x_high * 3;
}
if (y_high < src_height) {
if (x_low >= 0)
v3 = src + y_high * src_line_size + x_low * 3;
if (x_high < src_width)
v4 = src + y_high * src_line_size + x_high * 3;
}
c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
}
// BGR -> RGB channel swap.
float t = c2;
c2 = c0;
c0 = t;
// Normalize to [0, 1].
c0 = c0 / 255.0f;
c1 = c1 / 255.0f;
c2 = c2 / 255.0f;
// Interleaved -> planar (CHW) layout.
int area = dst_width * dst_height;
float* pdst_c0 = dst + dy * dst_width + dx;
float* pdst_c1 = pdst_c0 + area;
float* pdst_c2 = pdst_c1 + area;
*pdst_c0 = c0;
*pdst_c1 = c1;
*pdst_c2 = c2;
}
// Asynchronously letterboxes one BGR8 image into the network input tensor:
// copies src into pinned memory, uploads it on `stream`, builds the dst->src
// affine transform and launches warpaffine_kernel.
// The pinned staging buffer is shared, so callers must synchronize the stream
// before staging the next image (cuda_batch_preprocess does this).
// Fix: added CUDA_CHECK(cudaGetLastError()) so launch-configuration errors are
// reported immediately instead of surfacing at some later synchronizing call.
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // Stage through pinned memory so the H2D copy can be asynchronous.
    memcpy(img_buffer_host, src, img_size);
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));
    // Build the src->dst letterbox transform, then invert it: the kernel walks
    // destination pixels and samples the source.
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
    int jobs = dst_height * dst_width;  // one thread per destination pixel
    int threads = 256;
    int blocks = (jobs + threads - 1) / threads;  // integer ceil-div
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
    CUDA_CHECK(cudaGetLastError());
}
// Preprocesses a batch sequentially into the planar input tensor `dst`
// (dst_size floats per image). The per-image cudaStreamSynchronize is
// required because cuda_preprocess stages every image through the same
// shared pinned host buffer.
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
cudaStream_t stream) {
int dst_size = dst_width * dst_height * 3;
for (size_t i = 0; i < img_batch.size(); i++) {
cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width,
dst_height, stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
}
}
// Allocates the pinned host and device staging buffers used by cuda_preprocess.
// `max_image_size` is in pixels; * 3 accounts for the BGR8 channels.
void cuda_preprocess_init(int max_image_size) {
CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
}
// Frees the staging buffers allocated by cuda_preprocess_init.
void cuda_preprocess_destroy() {
CUDA_CHECK(cudaFree(img_buffer_device));
CUDA_CHECK(cudaFreeHost(img_buffer_host));
}
// Decodes raw OBB predictions into the compact buffer `parray`.
// predict[0] holds the candidate count; each candidate occupies
// sizeof(Detection)/sizeof(float) floats (angle at offset 89, the last one).
// Output layout per slot: cx, cy, w, h, conf, class, keep flag, angle, with
// parray[0] as the running box count (buffer is zeroed before launch).
// Fix: the confidence test now runs BEFORE atomicAdd. The original claimed a
// slot first and only then rejected low-confidence boxes, so the count was
// inflated by every rejected candidate and the buffer was left with zeroed
// gaps that downstream NMS/host code iterated over.
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;
    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;
    // Claim an output slot only for candidates that pass the threshold.
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    float angle = pitem[89];  // Detection::angle (last float of the 90-float record)
    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // keep flag; cleared by NMS for suppressed boxes
    *pout_item++ = angle;
}
// Decodes raw axis-aligned predictions into the compact buffer `parray`
// (slot layout: l, t, r, b, conf, class, keep flag; parray[0] = count).
// Fix: same atomicAdd-ordering bug as decode_kernel_obb — the confidence test
// now runs BEFORE atomicAdd so rejected candidates no longer inflate the
// count and leave zeroed gaps in the output buffer.
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;
    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;
    // Claim an output slot only for candidates that pass the threshold.
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;
    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];
    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // keep flag; cleared by NMS for suppressed boxes
}
// Intersection-over-union of two axis-aligned boxes given as
// (left, top, right, bottom). Returns 0 when the boxes do not overlap.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
float bright, float bbottom) {
    float inter_l = max(aleft, bleft);
    float inter_t = max(atop, btop);
    float inter_r = min(aright, bright);
    float inter_b = min(abottom, bbottom);
    float inter_area = max(inter_r - inter_l, 0.0f) * max(inter_b - inter_t, 0.0f);
    if (inter_area == 0.0f)
        return 0.0f;
    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (a_area + b_area - inter_area);
}
// In-place NMS over the compact decode buffer: one thread per candidate box.
// A thread clears its own keep flag (offset 6) if any same-class box with a
// higher score (ties broken by lower index) overlaps it above `threshold`.
// bboxes[0] holds the box count written by the decode kernel.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
int position = (blockDim.x * blockIdx.x + threadIdx.x);
int count = bboxes[0];
if (position >= count)
return;
float* pcurrent = bboxes + 1 + position * bbox_element;
for (int i = 0; i < count; ++i) {
float* pitem = bboxes + 1 + i * bbox_element;
// Skip self and boxes of a different class.
if (i == position || pcurrent[5] != pitem[5])
continue;
if (pitem[4] >= pcurrent[4]) {
// On equal confidence, only the lower-indexed box survives.
if (pitem[4] == pcurrent[4] && i < position)
continue;
float iou =
box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]);
if (iou > threshold) {
pcurrent[6] = 0;
return;
}
}
}
}
// Computes the covariance terms (a, b, c) of the 2-D Gaussian representing a
// rotated box of size (w, h) at rotation r radians, used by box_probiou.
// w*w/12 and h*h/12 are the variances of a uniform distribution over each side.
// NOTE(review): "convariance" is a misspelling of "covariance"; kept because
// box_probiou below calls it by this name.
static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
float a_val = w * w / 12.0f;
float b_val = h * h / 12.0f;
float cos_r = cosf(r);
float sin_r = sinf(r);
a = a_val * cos_r * cos_r + b_val * sin_r * sin_r;
b = a_val * sin_r * sin_r + b_val * cos_r * cos_r;
c = (a_val - b_val) * sin_r * cos_r;
}
// Probabilistic IoU between two rotated boxes given as (cx, cy, w, h, angle).
// Models each box as a 2-D Gaussian, computes a clamped Bhattacharyya-style
// distance bd from the covariance terms, and returns 1 - sqrt(1 - exp(-bd)),
// which is 1 for identical boxes and approaches 0 as they diverge.
// (Appears to follow the ProbIoU formulation used by Ultralytics OBB NMS.)
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
float h2, float r2, float eps = 1e-7) {
float a1, b1, c1, a2, b2, c2;
convariance_matrix(w1, h1, r1, a1, b1, c1);
convariance_matrix(w2, h2, r2, a2, b2, c2);
float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) /
((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) /
(4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) +
eps);
float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
// Clamp the distance to keep expf/sqrtf numerically well-behaved.
bd = fmaxf(fminf(bd, 100.0f), eps);
float hd = sqrtf(1.0f - expf(-bd) + eps);
return 1 - hd;
}
// In-place NMS for rotated boxes: same structure as nms_kernel, but overlap is
// measured with box_probiou on (cx, cy, w, h, angle), the angle being stored
// at offset 7 of each slot. A thread clears its own keep flag (offset 6) when
// suppressed by a same-class, higher-scoring box.
static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
int position = (blockDim.x * blockIdx.x + threadIdx.x);
int count = bboxes[0];
if (position >= count)
return;
float* pcurrent = bboxes + 1 + position * bbox_element;
for (int i = 0; i < count; ++i) {
float* pitem = bboxes + 1 + i * bbox_element;
// Skip self and boxes of a different class.
if (i == position || pcurrent[5] != pitem[5])
continue;
if (pitem[4] >= pcurrent[4]) {
// On equal confidence, only the lower-indexed box survives.
if (pitem[4] == pcurrent[4] && i < position)
continue;
float iou = box_probiou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pcurrent[7], pitem[0], pitem[1],
pitem[2], pitem[3], pitem[7]);
if (iou > threshold) {
pcurrent[6] = 0;
return;
}
}
}
}
// Launches decode_kernel with one thread per raw candidate box on `stream`.
// Fix: integer ceil-div instead of float ceil(), and cudaGetLastError() after
// the launch so configuration errors surface immediately.
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
cudaStream_t stream) {
    int block = 256;
    int grid = (num_bboxes + block - 1) / block;
    decode_kernel<<<grid, block, 0, stream>>>(predict, num_bboxes, confidence_threshold, parray, max_objects);
    CUDA_CHECK(cudaGetLastError());
}
// Launches nms_kernel with one thread per potential output slot on `stream`.
// Fix: integer ceil-div and cudaGetLastError() after the launch.
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    int block = max_objects < 256 ? max_objects : 256;
    int grid = (max_objects + block - 1) / block;
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
    CUDA_CHECK(cudaGetLastError());
}
// Launches decode_kernel_obb with one thread per raw candidate box on `stream`.
// Fix: integer ceil-div and cudaGetLastError() after the launch.
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
cudaStream_t stream) {
    int block = 256;
    int grid = (num_bboxes + block - 1) / block;
    decode_kernel_obb<<<grid, block, 0, stream>>>(predict, num_bboxes, confidence_threshold, parray,
                                                  max_objects);
    CUDA_CHECK(cudaGetLastError());
}
// Launches nms_kernel_obb with one thread per potential output slot on `stream`.
// Fix: integer ceil-div and cudaGetLastError() after the launch.
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    int block = max_objects < 256 ? max_objects : 256;
    int grid = (max_objects + block - 1) / block;
    nms_kernel_obb<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
    CUDA_CHECK(cudaGetLastError());
}
CMakeLists.txt
# Fix: "ccmake_minimum_required" was a typo that makes CMake fail immediately.
cmake_minimum_required(VERSION 3.18)
project(Yolov8obb-Cuda)
add_definitions(-DAPI_EXPORTS)
# NOMINMAX keeps windows.h from defining min/max macros that break std::min/std::max.
add_definitions(-DNOMINMAX)
# Hard-coded local SDK paths (Windows/MSVC) — adjust for your machine.
set(Tensorrt_Include "E:\\Tensorrt\\8.6.1.6\\samples\\common")
set(Window_Include "E:\\Tensorrt\\8.6.1.6\\window")
set(OpenCV_DIR "E:\\Opencv contrib\\newbuild\\install")
set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include)
set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc17\\lib)
# Debug/Release variants of the monolithic opencv_world library.
set(OpenCV_LIB_DEBUG ${OpenCV_DIR}\\x64\\vc17\\lib\\opencv_world470d.lib)
set(OpenCV_LIB_RELEASE ${OpenCV_DIR}\\x64\\vc17\\lib\\opencv_world470.lib)
# NOTE(review): SM 5.2 (Maxwell) only; add the architectures of your target GPUs.
set(CMAKE_CUDA_ARCHITECTURES 52)
find_package(CUDA REQUIRED)
enable_language(CUDA)
# QUIET because the include/lib paths are already set manually above.
find_package(OpenCV QUIET)
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})
include_directories(${Tensorrt_Include})
include_directories(${Window_Include})
link_directories(${OpenCV_LIB_DIRS})
set(SOURCES
main.cpp
yolo_obb.cpp
logger.cpp
process.cu
yolo_obb.h
process.h
logger.h
yolo_cuda_utils.h
)
add_executable(Yolov8obb-Cuda ${SOURCES})
# TensorRT and CUDA toolkit libraries (resolved via the linker search path).
target_link_libraries(Yolov8obb-Cuda "nvinfer.lib" "nvinfer_plugin.lib" "nvonnxparser.lib"
"nvparsers.lib"
"cublas.lib" "cublasLt.lib" "cuda.lib" "cudadevrt.lib" "cudart.lib"
"cudart_static.lib" "cudnn.lib" "cufft.lib" "cufftw.lib" "curand.lib" "cusolver.lib"
"cusolverMg.lib" "cusparse.lib" "nppc.lib" "nppial.lib"
"nppicc.lib" "nppidei.lib" "nppif.lib" "nppig.lib"
"nppim.lib" "nppist.lib" "nppisu.lib" "nppitc.lib"
"npps.lib" "nvblas.lib" "nvjpeg.lib" "nvml.lib"
"nvrtc.lib")
target_link_libraries(Yolov8obb-Cuda ${CUDA_LIBRARIES})
# Pick the matching OpenCV library per configuration.
target_link_libraries(Yolov8obb-Cuda
$<$<CONFIG:Debug>:${OpenCV_LIB_DEBUG}>
$<$<CONFIG:Release>:${OpenCV_LIB_RELEASE}>
)
火山引擎开发者社区是火山引擎打造的AI技术生态平台,聚焦Agent与大模型开发,提供豆包系列模型(图像/视频/视觉)、智能分析与会话工具,并配套评测集、动手实验室及行业案例库。社区通过技术沙龙、挑战赛等活动促进开发者成长,新用户可领50万Tokens权益,助力构建智能应用。
更多推荐
所有评论(0)