Step 1 — Export the YOLOv8-OBB model to ONNX (Ultralytics Python API)

# Export a YOLOv8 OBB model to ONNX with Ultralytics.
from ultralytics import YOLO

# Load the pretrained OBB weights directly. (Building a model from
# "yolov8n-obb.yaml" first was redundant: the second YOLO(...) call
# immediately replaced and discarded it.)
model = YOLO("yolov8n-obb.pt")
trainer = model.train(data="dota8.yaml", epochs=3, imgsz=1024)  # fine-tune on DOTA8 at 1024x1024
metrics = model.val(data="dota8.yaml")                          # validation metrics
results = model("boats.jpg")                                    # smoke-test a single prediction
model.export(format="onnx")                                     # writes yolov8n-obb.onnx next to the weights

Step 2 — Convert the ONNX model to a TensorRT engine with trtexec (FP16)

trtexec.exe --onnx=yolov8n-obb.onnx --saveEngine=yolov8n-obb.engine --fp16

logger.h

// logger.h — global logging objects shared by the TensorRT sample code.
#include "logging.h"
// Error recorder handed to TensorRT so runtime errors can be captured by
// our code. Defined in logger.cpp.
class SampleErrorRecorder;
extern SampleErrorRecorder gRecorder;
namespace sample
{
    // Process-wide logger plus one stream consumer per severity level;
    // all defined in logger.cpp.
    extern Logger gLogger;
    extern LogStreamConsumer gLogVerbose;
    extern LogStreamConsumer gLogInfo;
    extern LogStreamConsumer gLogWarning;
    extern LogStreamConsumer gLogError;
    extern LogStreamConsumer gLogFatal;
    // Set the minimum severity that is actually reported, on the logger and
    // on every per-severity stream consumer.
    void setReportableSeverity(Logger::Severity severity);
} 

yolo_cuda_utils.h

// yolo_cuda_utils.h — shared helpers: CUDA error checking, letterboxing,
// directory listing and label loading.
#pragma once  // moved to the top of the header, its conventional position

#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include <cassert>   // CUDA_CHECK uses assert; was relying on a transitive include
#include <fstream>
#include <iostream>  // CUDA_CHECK uses std::cerr; was relying on a transitive include
#include "logger.h"
#include "ErrorRecorder.h"
#include "NvInfer.h"

// Evaluate a CUDA runtime call and abort on failure, printing the numeric
// error code and the call site. NOTE: assert(0) compiles out under NDEBUG,
// so release builds will continue past a failed CUDA call.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif
// Letterbox `img` into an input_w x input_h canvas: scale to fit while
// preserving aspect ratio, then pad the remainder with gray (128,128,128).
static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
    const float scale_w = input_w / (img.cols * 1.0);
    const float scale_h = input_h / (img.rows * 1.0);
    int new_w, new_h, pad_x, pad_y;
    if (scale_h > scale_w) {
        // Width is the limiting dimension: fill it, center vertically.
        new_w = input_w;
        new_h = scale_w * img.rows;
        pad_x = 0;
        pad_y = (input_h - new_h) / 2;
    } else {
        // Height is the limiting dimension: fill it, center horizontally.
        new_w = scale_h * img.cols;
        new_h = input_h;
        pad_x = (input_w - new_w) / 2;
        pad_y = 0;
    }
    cv::Mat resized(new_h, new_w, CV_8UC3);
    cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);
    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
    resized.copyTo(canvas(cv::Rect(pad_x, pad_y, resized.cols, resized.rows)));
    return canvas;
}
// List the entries of directory `p_dir_name` (skipping "." and "..") into
// `file_names`. Returns 0 on success, -1 if the directory cannot be opened.
static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
    DIR* dir = opendir(p_dir_name);
    if (dir == nullptr)
        return -1;
    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
        std::string name(entry->d_name);
        if (name != "." && name != "..")
            file_names.push_back(name);
    }
    closedir(dir);
    return 0;
}
// Strip space characters from BOTH ends of `str` (despite the name, trailing
// spaces are removed too). A string containing only spaces (or nothing) is
// returned unchanged. Only ' ' is trimmed — not tabs or other whitespace.
static inline std::string trim_leading_whitespace(const std::string& str) {
    const size_t begin = str.find_first_not_of(' ');
    if (begin == std::string::npos)
        return str;  // all spaces (or empty): nothing to trim
    const size_t end = str.find_last_not_of(' ');
    return str.substr(begin, end - begin + 1);
}
// Format `a_value` in fixed-point notation with `n` digits after the
// decimal point (default 2), e.g. 3.14159f -> "3.14".
static inline std::string to_string_with_precision(const float a_value, const int n = 2) {
    std::ostringstream stream;
    stream.precision(n);
    stream << std::fixed << a_value;
    return stream.str();
}
// Read one class label per line from `labels_filename` into `labels_map`,
// keyed by 0-based line index; surrounding spaces are trimmed from each
// label. Returns 0 on success, -1 if the file cannot be opened (previously
// an unreadable file silently produced an empty map and still returned 0).
static inline int read_labels(const std::string labels_filename, std::unordered_map<int, std::string>& labels_map) {
    std::ifstream file(labels_filename);
    if (!file.is_open()) {
        return -1;
    }
    std::string line;
    int index = 0;
    while (std::getline(file, line)) {
        labels_map[index] = trim_leading_whitespace(line);
        index++;
    }
    return 0;  // ifstream closes itself on destruction
}
// Build-time configuration and shared data types for the demo.
#define USE_FP16  // build TensorRT engines with FP16 kernels enabled
// Engine tensor names — must match the exported ONNX graph.
const static char* kInputTensorName = "images";
const static char* kOutputTensorName = "output0";
const static int kNumClass = 80;  // detection classes (COCO)
const static int kBatchSize = 1;
const static int kGpuId = 0;
// OBB network input resolution.
const static int kInputH = 1024;
const static int kInputW = 1024;
const static float kNmsThresh = 0.45f;
const static float kConfThresh = 0.5f;
const static float kConfThreshKeypoints = 0.5f;  // pose: minimum keypoint visibility
const static int kMaxInputImageSize = 3000 * 3000;  // pinned staging buffer capacity (pixels)
const static int kMaxNumOutputBbox = 1000;          // capacity of the compact decode buffer
const static char* kInputQuantizationFolder = "./coco_calib";  // INT8 calibration images
// Classification-head dimensions.
constexpr static int kClsNumClass = 1000;
constexpr static int kClsInputH = 224;
constexpr static int kClsInputW = 224;
// Pose-head dimensions.
constexpr static int kPoseNumClass = 1;
const static int kNumberOfPoints = 17;  // COCO keypoints per person
// OBB-head classes (DOTA).
constexpr static int kObbNumClass = 15;
// One decoded detection. The GPU decode kernels step through the raw
// network output with a stride of sizeof(Detection)/sizeof(float) floats,
// so the field layout here defines the row layout there (angle is the last
// of the 90 floats).
struct alignas(float) Detection {
    float bbox[4];                         // cx,cy,w,h (OBB) or l,t,r,b (axis-aligned)
    float conf;                            // confidence score
    float class_id;                        // class index stored as float
    float mask[32];                        // segmentation mask coefficients
    float keypoints[kNumberOfPoints * 3];  // x, y, visibility per point
    float angle;                           // OBB rotation, radians (see get_corner)
};
// 2x3 affine transform used by the GPU letterbox preprocessing.
struct AffineMatrix {
    float value[6];
};
// Floats per slot in the compact decode buffer:
// cx, cy, w, h, confidence, class_id, keep_flag, angle.
// BUG FIX: this was sizeof(AffineMatrix)/sizeof(float) + 1 == 7, but the OBB
// decode kernel writes 8 floats per slot and the host reader fetches the
// angle at offset +7 — with a stride of 7, consecutive slots overlapped and
// slot i's angle was clobbered by slot i+1's cx.
const int bbox_element = 8;

yolo_obb.h

#pragma once
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "yolo_cuda_utils.h"
#include <fstream>
#include <iostream>
#include "process.h"
#include "logging.h"
// Map an axis-aligned box from letterboxed network coordinates back into
// original-image pixel coordinates.
cv::Rect get_rect(cv::Mat& img, float bbox[4]);
// Split the flat GPU decode buffer into per-image Detection vectors.
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch);
// GPU decode + NMS launchers (also declared in process.h).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                     cudaStream_t stream);
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
// Drawing helpers for the different task heads (detect / OBB / pose / seg).
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);
// End-to-end OBB demo over the "images" directory.
void infer_image_demo();

process.h

#pragma once
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "yolo_cuda_utils.h"
#include <map>
// Allocate / free the pinned-host and device staging buffers used by the
// GPU preprocessing (sized for max_image_size pixels).
void cuda_preprocess_init(int max_image_size);
void cuda_preprocess_destroy();
// Letterbox one BGR image on the GPU into planar float output at
// dst_width x dst_height (see warpaffine_kernel in process.cu).
void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, cudaStream_t stream);
// Preprocess a whole batch, one image after another, into `dst`.
void cuda_batch_preprocess(std::vector<cv::Mat> &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream);
// GPU decode of raw network output into the compact per-slot buffer, and
// GPU NMS over that buffer (axis-aligned and OBB variants).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
	cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
	cudaStream_t stream);
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);

logger.cpp

#include "logger.h"
#include "ErrorRecorder.h"
#include "logging.h"
using namespace nvinfer1;
// Definitions for the globals declared in logger.h.
SampleErrorRecorder gRecorder;
namespace sample
{
    // Default to INFO-level reporting; one stream consumer per severity.
    Logger gLogger{ Logger::Severity::kINFO };
    LogStreamConsumer gLogVerbose{ LOG_VERBOSE(gLogger) };
    LogStreamConsumer gLogInfo{ LOG_INFO(gLogger) };
    LogStreamConsumer gLogWarning{ LOG_WARN(gLogger) };
    LogStreamConsumer gLogError{ LOG_ERROR(gLogger) };
    LogStreamConsumer gLogFatal{ LOG_FATAL(gLogger) };
    // Apply the new minimum severity to the logger and to every consumer.
    void setReportableSeverity(Logger::Severity severity)
    {
        gLogger.setReportableSeverity(severity);
        gLogVerbose.setReportableSeverity(severity);
        gLogInfo.setReportableSeverity(severity);
        gLogWarning.setReportableSeverity(severity);
        gLogError.setReportableSeverity(severity);
        gLogFatal.setReportableSeverity(severity);
    }
}

yolo_obb.cpp

#include "yolo_obb.h"
#include "yolo_cuda_utils.h"
#include "ErrorRecorder.h"
#include "logging.h"
using namespace nvinfer1;
// Translation-unit logger passed to createInferRuntime() below. Note this
// is a distinct object from sample::gLogger defined in logger.cpp.
sample::Logger gLogger;
// Map an axis-aligned box {left, top, right, bottom} from letterboxed
// network coordinates (kInputW x kInputH) back into original-image pixel
// coordinates, clamped to the image bounds.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    float l, r, t, b;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width-limited letterbox: vertical padding was added top/bottom.
        // Remove the pad from the y coordinates, then undo the scale r_w.
        l = bbox[0];
        r = bbox[2];
        t = bbox[1] - (kInputH - r_w * img.rows) / 2;
        b = bbox[3] - (kInputH - r_w * img.rows) / 2;
        l = l / r_w;
        r = r / r_w;
        t = t / r_w;
        b = b / r_w;
    } else {
        // Height-limited letterbox: horizontal padding was added left/right.
        l = bbox[0] - (kInputW - r_h * img.cols) / 2;
        r = bbox[2] - (kInputW - r_h * img.cols) / 2;
        t = bbox[1];
        b = bbox[3];
        l = l / r_h;
        r = r / r_h;
        t = t / r_h;
        b = b / r_h;
    }
    // Clamp so the resulting cv::Rect always lies inside the image.
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
    return cv::Rect(int(round(l)), int(round(t)), width, height);
}
// Same inverse-letterbox mapping as get_rect(), but additionally rescales
// the keypoint array `lmk` IN PLACE (x at +0 and y at +1 per triple; the
// visibility score at +2 is left untouched).
cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) {
    float l, r, t, b;
    float r_w = kInputW / (img.cols * 1.0);
    float r_h = kInputH / (img.rows * 1.0);
    if (r_h > r_w) {
        // Width-limited letterbox: remove vertical pad from y, undo scale.
        l = bbox[0] / r_w;
        r = bbox[2] / r_w;
        t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w;
        b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w;
        for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
            lmk[i] /= r_w;
            lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w;
        }
    } else {
        // Height-limited letterbox: remove horizontal pad from x, undo scale.
        l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h;
        r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h;
        t = bbox[1] / r_h;
        b = bbox[3] / r_h;
        for (int i = 0; i < kNumberOfPoints * 3; i += 3) {
            lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h;
            lmk[i + 1] /= r_h;
        }
    }
    // Clamp so the resulting cv::Rect always lies inside the image.
    l = std::max(0.0f, l);
    t = std::max(0.0f, t);
    int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l))));
    int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t))));
    return cv::Rect(int(round(l)), int(round(t)), width, height);
}
// Draw axis-aligned detection boxes and numeric class ids onto each image.
void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    for (size_t idx = 0; idx < img_batch.size(); idx++) {
        cv::Mat image = img_batch[idx];
        for (auto& det : res_batch[idx]) {
            const cv::Rect rect = get_rect(image, det.bbox);
            cv::rectangle(image, rect, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(image, std::to_string((int)det.class_id), cv::Point(rect.x, rect.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
        }
    }
}
// Draw pose results: bounding box, class id, a dot per keypoint whose
// visibility exceeds 0.5, and skeleton lines between keypoint pairs where
// both endpoints are visible.
void draw_bbox_keypoints_line(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    const std::vector<std::pair<int, int>> skeleton_pairs = {
            {0, 1}, {0, 2},  {0, 5}, {0, 6},  {1, 2},   {1, 3},   {2, 4},   {5, 6},   {5, 7},  {5, 11},
            {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}};
    for (size_t idx = 0; idx < img_batch.size(); idx++) {
        cv::Mat image = img_batch[idx];
        for (auto& det : res_batch[idx]) {
            // Box + class id; the call also rescales det.keypoints in place.
            cv::Rect rect = get_rect_adapt_landmark(image, det.bbox, det.keypoints);
            cv::rectangle(image, rect, cv::Scalar(0x27, 0xC1, 0x36), 2);
            cv::putText(image, std::to_string((int)det.class_id), cv::Point(rect.x, rect.y - 1),
                        cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
            // Keypoint dots, visibility gated at 0.5.
            for (int k = 0; k < kNumberOfPoints * 3; k += 3) {
                if (det.keypoints[k + 2] > 0.5)
                    cv::circle(image, cv::Point((int)det.keypoints[k], (int)det.keypoints[k + 1]), 3,
                               cv::Scalar(0, 0x27, 0xC1), -1);
            }
            // Skeleton segments where both endpoints are visible.
            for (const auto& bone : skeleton_pairs) {
                const int ka = bone.first * 3;
                const int kb = bone.second * 3;
                if (det.keypoints[ka + 2] > 0.5 && det.keypoints[kb + 2] > 0.5) {
                    cv::Point from((int)det.keypoints[ka], (int)det.keypoints[ka + 1]);
                    cv::Point to((int)det.keypoints[kb], (int)det.keypoints[kb + 1]);
                    cv::line(image, from, to, cv::Scalar(0, 0x27, 0xC1), 2);
                }
            }
        }
    }
}
// Map a network-resolution mask back onto the original image: crop out the
// letterboxed (non-padding) region and resize it to the image size.
cv::Mat scale_mask(cv::Mat mask, cv::Mat img) {
    const float ratio_w = kInputW / (img.cols * 1.0);
    const float ratio_h = kInputH / (img.rows * 1.0);
    cv::Rect roi;
    if (ratio_h > ratio_w) {
        // Width-limited: full width, vertically centered band.
        roi.width = kInputW;
        roi.height = ratio_w * img.rows;
        roi.x = 0;
        roi.y = (kInputH - roi.height) / 2;
    } else {
        // Height-limited: full height, horizontally centered band.
        roi.width = ratio_h * img.cols;
        roi.height = kInputH;
        roi.x = (kInputW - roi.width) / 2;
        roi.y = 0;
    }
    cv::Mat scaled;
    cv::resize(mask(roi), scaled, img.size());
    return scaled;
}
// Draw segmentation results on `img`: alpha-blend each instance mask (50/50
// with a per-class color) inside its box, then draw the box and a
// "label conf" caption on a filled background.
// NOTE(review): assumes masks[i] is CV_32F (read via at<float>) and that
// dets and masks are parallel arrays — confirm at the call site.
void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map) {
    // Fixed 20-color palette, indexed by class id modulo its size.
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < dets.size(); i++) {
        // Bring the network-resolution mask back to image coordinates.
        cv::Mat img_mask = scale_mask(masks[i], img);
        auto color = colors[(int)dets[i].class_id % colors.size()];
        auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);

        cv::Rect r = get_rect(img, dets[i].bbox);
        // Blend mask pixels (probability > 0.5) 50/50 with the class color.
        for (int x = r.x; x < r.x + r.width; x++) {
            for (int y = r.y; y < r.y + r.height; y++) {
                float val = img_mask.at<float>(y, x);
                if (val <= 0.5)
                    continue;
                img.at<cv::Vec3b>(y, x)[0] = img.at<cv::Vec3b>(y, x)[0] / 2 + bgr[0] / 2;
                img.at<cv::Vec3b>(y, x)[1] = img.at<cv::Vec3b>(y, x)[1] / 2 + bgr[1] / 2;
                img.at<cv::Vec3b>(y, x)[2] = img.at<cv::Vec3b>(y, x)[2] / 2 + bgr[2] / 2;
            }
        }
        cv::rectangle(img, r, bgr, 2);
        // Caption with a filled background sized to the rendered text.
        cv::Size textSize =
                cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
                                cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL);
        cv::Point topLeft(r.x, r.y - textSize.height);
        cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height);
        int lineThickness = 2;
        cv::rectangle(img, topLeft, bottomRight, bgr, -1);
        cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf),
                    cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2);
    }
}
// Copy detections flagged as kept (keep_flag == 1) out of the flat decode
// buffer into `res`. The buffer starts with one count float, followed by
// `bbox_element`-float slots laid out as written by the decode kernel:
// cx, cy, w, h, conf, class_id, keep_flag, angle.
void process_decode_ptr_host_obb(std::vector<Detection>& res, const float* decode_ptr_host, int bbox_element,
                                 cv::Mat& img, int count) {
    Detection det;
    for (int i = 0; i < count; i++) {
        const float* slot = decode_ptr_host + 1 + i * bbox_element;
        if ((int)slot[6] != 1)
            continue;  // suppressed by NMS (or slot never written)
        det.bbox[0] = slot[0];
        det.bbox[1] = slot[1];
        det.bbox[2] = slot[2];
        det.bbox[3] = slot[3];
        det.conf = slot[4];
        det.class_id = slot[5];
        det.angle = slot[7];
        res.push_back(det);
    }
}
// Split the compact decode buffer into per-image detection lists.
// NOTE(review): the per-image offset `i * count` only lands on valid data
// for batch_size == 1 (image 0, offset 0). The rest of this demo enforces
// kBatchSize == 1 (prepare_buffer exits otherwise), so larger batches never
// reach this code — confirm the buffer layout before enabling batching.
void batch_process_obb(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                       int bbox_element, const std::vector<cv::Mat>& img_batch) {
    res_batch.resize(batch_size);
    // Leading float holds how many slots the decode kernel claimed; clamp
    // to the allocated capacity.
    int count = static_cast<int>(*decode_ptr_host);
    count = std::min(count, kMaxNumOutputBbox);
    for (int i = 0; i < batch_size; i++) {
        auto& img = const_cast<cv::Mat&>(img_batch[i]);
        process_decode_ptr_host_obb(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
    }
}
// Compute the four corners of an oriented box in original-image
// coordinates: normalize the angle, map the box's axis-aligned extent back
// through the letterbox with get_rect(), rotate the half-extent vectors,
// and clamp each corner to the image bounds.
// Fixes: removed three per-detection std::cout debug prints and the dead
// `normal_angle` computation whose result was never used.
static std::vector<cv::Point> get_corner(cv::Mat& img, const Detection& box) {
    float cx = box.bbox[0];
    float cy = box.bbox[1];
    float w = box.bbox[2];
    float h = box.bbox[3];
    // Network angle is stored in radians; normalize in degrees.
    float angle = box.angle * 180.0f / CV_PI;
    // Canonicalize so w is the long side, shifting the angle by 90 degrees.
    if (h >= w) {
        std::swap(w, h);
        angle = fmod(angle + 90.0f, 180.0f);
    }
    // Fold the angle into [0, 180).
    if (angle < 0) {
        angle += 360.0f;
    }
    if (angle > 180.0f) {
        angle -= 180.0f;
    }
    float cos_value = std::cos(angle * CV_PI / 180.0f);
    float sin_value = std::sin(angle * CV_PI / 180.0f);
    // Axis-aligned extent in network coordinates -> image coordinates.
    float bbox[4] = {cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2};
    cv::Rect rect = get_rect(img, bbox);
    float x_ = (rect.x + rect.x + rect.width) / 2;   // rect center x
    float y_ = (rect.y + rect.y + rect.height) / 2;  // rect center y
    float width = rect.width;
    float height = rect.height;
    // Half-extent vectors rotated by the normalized angle.
    float vec1x = width / 2 * cos_value;
    float vec1y = width / 2 * sin_value;
    float vec2x = -height / 2 * sin_value;
    float vec2y = height / 2 * cos_value;
    std::vector<cv::Point> corner_points(4);
    corner_points[0] = cv::Point(int(round(x_ + vec1x + vec2x)), int(round(y_ + vec1y + vec2y)));
    corner_points[1] = cv::Point(int(round(x_ + vec1x - vec2x)), int(round(y_ + vec1y - vec2y)));
    corner_points[2] = cv::Point(int(round(x_ - vec1x - vec2x)), int(round(y_ - vec1y - vec2y)));
    corner_points[3] = cv::Point(int(round(x_ - vec1x + vec2x)), int(round(y_ - vec1y + vec2y)));
    // Keep every corner inside the image so drawing never goes out of bounds.
    for (auto& point : corner_points) {
        point.x = std::max(0, std::min(point.x, img.cols - 1));
        point.y = std::max(0, std::min(point.y, img.rows - 1));
    }
    return corner_points;
}
// Draw oriented boxes: outline the four rotated corners of each detection
// and print "class:conf" on a filled background anchored at the first
// corner — above it when there is room, otherwise below.
void draw_bbox_obb(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch) {
    // Fixed 20-color palette, indexed by class id modulo its size.
    static std::vector<uint32_t> colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17,
                                           0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF,
                                           0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7};
    for (size_t i = 0; i < img_batch.size(); i++) {
        auto& res = res_batch[i];
        auto& img = img_batch[i];
        for (auto& obj : res) {
            auto color = colors[(int)obj.class_id % colors.size()];
            auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF);
            auto corner_points = get_corner(img, obj);
            cv::polylines(img, std::vector<std::vector<cv::Point>>{corner_points}, true, bgr, 1);

            // "class:conf" caption, e.g. "3:0.87".
            auto text = (std::to_string((int)(obj.class_id)) + ":" + to_string_with_precision(obj.conf));
            cv::Size textsize = cv::getTextSize(text, 0, 0.3, 1, nullptr);
            int width = textsize.width;
            int height = textsize.height;
            // Place the label above corner 0 unless it would leave the image.
            bool outside = (corner_points[0].y - height >= 3) ? true : false;
            cv::Point p1(corner_points[0].x, corner_points[0].y), p2;
            p2.x = corner_points[0].x + width;
            if (outside) {
                p2.y = corner_points[0].y - height - 3;
            } else {
                p2.y = corner_points[0].y + height + 3;
            }
            cv::rectangle(img, p1, p2, bgr, -1, cv::LINE_AA);
            cv::putText(
                    img, text,
                    cv::Point(corner_points[0].x, (outside ? corner_points[0].y - 2 : corner_points[0].y + height + 2)),
                    0, 0.3, cv::Scalar::all(255), 1, cv::LINE_AA);
        }
    }
}
// Floats in the raw network output buffer: kMaxNumOutputBbox rows of
// sizeof(Detection)/sizeof(float) each, plus one leading count value.
const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
// Load a serialized TensorRT engine from `engine_name` and create the
// runtime, engine and execution context. Aborts via assert on any failure,
// matching the rest of the demo's error handling.
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
    IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    file.seekg(0, file.end);
    const size_t size = file.tellg();
    file.seekg(0, file.beg);
    // std::vector instead of raw new[]: no leak on any early exit, and the
    // buffer is released automatically once deserialization is done.
    std::vector<char> serialized_engine(size);
    file.read(serialized_engine.data(), size);
    if (!file) {  // previously a short/failed read went undetected
        std::cerr << "incomplete read of " << engine_name << std::endl;
        assert(false);
    }
    file.close();
    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine.data(), size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
}
// Allocate the device input/output buffers and the host/device compact
// decode buffers. `output_buffer_host` is not allocated here — this demo
// post-processes on the GPU and only copies back the decode buffer.
void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device)
{
    // Expect exactly one input binding and one output binding, in order.
    assert(engine->getNbBindings() == 2);
    const int inputIndex = engine->getBindingIndex(kInputTensorName);
    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Input: NCHW float image; output: raw network result (kOutputSize floats).
    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
    // The decode buffers below are sized for a single image only.
    if (kBatchSize > 1) {
        std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
        exit(0);
    }
    // Compact decode buffer: one count float + bbox_element floats per slot.
    *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
    CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
}
// Run one inference pass plus GPU decode + NMS, then copy the compact
// decode buffer back to `decode_ptr_host`. Prints the measured latency.
void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
    float* decode_ptr_host, float* decode_ptr_device, int model_bboxes) {
    auto start = std::chrono::system_clock::now();
    context.enqueue(batchsize, buffers, stream, nullptr);
    // Zero the count and every slot before the decode kernel atomically
    // fills them.
    CUDA_CHECK(
        cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
    cuda_decode_obb((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
    cuda_nms_obb(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  //cuda nms
    CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
        sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
        stream));
    // BUG FIX: synchronize BEFORE stopping the clock. All of the calls
    // above only enqueue work on the stream; the old code took the end
    // timestamp while the GPU was still running, so the printed number was
    // just the enqueue overhead, not the inference + postprocess latency.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::system_clock::now();
    std::cout << "inference and gpu postprocess time: "
        << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}
void infer_image_demo() {
    cudaSetDevice(kGpuId);
    std::string engine_name = "yolov8n-obb.engine";
    std::string img_dir = "images";
    std::string sub_type = "";
    int model_bboxes;
    int is_p = 0;
    float gd = 0.0f, gw = 0.0f;
    int max_channels = 0;
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cuda_preprocess_init(kMaxInputImageSize);
    auto out_dims = engine->getBindingDimensions(1);
    model_bboxes = out_dims.d[0];
    float* device_buffers[2];
    float* output_buffer_host = nullptr;
    float* decode_ptr_host = nullptr;
    float* decode_ptr_device = nullptr;
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
    }
    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
        &decode_ptr_device);
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }
        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
            decode_ptr_device, model_bboxes);
        std::vector<std::vector<Detection>> res_batch;
        batch_process_obb(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
        draw_bbox_obb(img_batch, res_batch);
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(device_buffers[0]));
    CUDA_CHECK(cudaFree(device_buffers[1]));
    CUDA_CHECK(cudaFree(decode_ptr_device));
    delete[] decode_ptr_host;
    delete[] output_buffer_host;
    cuda_preprocess_destroy();
    delete context;
    delete engine;
    delete runtime;
}

process.cu

#include "yolo_cuda_utils.h"
#include "process.h"
// Staging buffers for preprocessing: pinned host memory (for fast async
// host-to-device copies) and its device-side counterpart. Both are sized in
// cuda_preprocess_init() and released in cuda_preprocess_destroy().
static uint8_t* img_buffer_host = nullptr;
static uint8_t* img_buffer_device = nullptr;
// One thread per destination pixel: apply the dst->src affine transform
// `d2s`, bilinearly sample the source image (3-channel uint8), fill
// out-of-range pixels with `const_value_st`, swap channels 0<->2, scale to
// [0,1], and write planar (channel-major) float output.
__global__ void warpaffine_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst,
                                  int dst_width, int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge)
        return;
    // Unpack the 2x3 affine matrix.
    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];
    // Destination pixel owned by this thread.
    int dx = position % dst_width;
    int dy = position / dst_width;
    // Corresponding (sub-pixel) source location.
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;
    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // Entirely outside the source: use the padding value.
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        // Bilinear interpolation over the 2x2 neighborhood; out-of-bounds
        // corners fall back to the padding value.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;
        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;
        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;
            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }
        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;
            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }
        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }
    // Swap channels 0 and 2 (BGR -> RGB, assuming OpenCV BGR input).
    float t = c2;
    c2 = c0;
    c0 = t;
    // Normalize to [0, 1].
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;
    // Planar (CHW) write: one plane per channel.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}
// Letterbox-preprocess one image on the GPU: stage it in pinned host
// memory, async-upload it, and launch warpaffine_kernel with the inverse
// (dst->src) of the scale-and-center transform.
// NOTE(review): the memcpy into img_buffer_host is not synchronized against
// a previous async upload from the same buffer — the caller must sync the
// stream between images (cuda_batch_preprocess does).
void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
                     cudaStream_t stream) {
    int img_size = src_width * src_height * 3;
    // Stage in pinned memory so the H2D copy can be asynchronous.
    memcpy(img_buffer_host, src, img_size);
    CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream));
    // Build src->dst: uniform scale plus centering offsets (letterbox).
    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;
    // The kernel maps destination pixels back to the source, so invert.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
    // One thread per destination pixel.
    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = ceil(jobs / (float)threads);
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(img_buffer_device, src_width * 3, src_width, src_height, dst,
                                                      dst_width, dst_height, 128, d2s, jobs);
}
// Preprocess a batch image-by-image into consecutive CHW slabs of `dst`.
// The per-iteration synchronize protects the shared pinned staging buffer
// reused by cuda_preprocess (at the cost of serializing the batch).
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
                           cudaStream_t stream) {
    int dst_size = dst_width * dst_height * 3;
    for (size_t i = 0; i < img_batch.size(); i++) {
        cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width,
                        dst_height, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }
}
// Allocate the pinned host and device staging buffers, sized for
// max_image_size pixels of 3-channel uint8 data.
void cuda_preprocess_init(int max_image_size) {
    CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
    CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
}
// Release the staging buffers allocated by cuda_preprocess_init().
void cuda_preprocess_destroy() {
    CUDA_CHECK(cudaFree(img_buffer_device));
    CUDA_CHECK(cudaFreeHost(img_buffer_host));
}
// OBB decode: one thread per candidate row of the raw output (row stride is
// sizeof(Detection)/sizeof(float) floats). Rows passing the confidence
// threshold claim a slot in `parray` via atomicAdd on the leading count and
// write cx, cy, w, h, conf, class, keep_flag=1, angle.
static __global__ void decode_kernel_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray,
    int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;
    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    // BUG FIX: test the confidence BEFORE claiming an output slot. The old
    // order called atomicAdd first, so every candidate — including ones
    // below the threshold — inflated the count and burned a slot whose
    // keep_flag was never written, exhausting max_objects early.
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];
    float label = pitem[5];
    // Angle is the last float of the row (see struct Detection: 90 floats,
    // angle at index 89).
    float angle = pitem[89];
    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = cx;
    *pout_item++ = cy;
    *pout_item++ = width;
    *pout_item++ = height;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // keep_flag; may be cleared later by NMS
    *pout_item++ = angle;
}
// Axis-aligned decode: same scheme as decode_kernel_obb but the box is
// stored as left, top, right, bottom and no angle is written.
static __global__ void decode_kernel(float* predict, int num_bboxes, float confidence_threshold, float* parray,
    int max_objects) {
    float count = predict[0];
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    if (position >= count)
        return;
    float* pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float));
    // BUG FIX: test the confidence BEFORE claiming an output slot (the old
    // atomicAdd-first order inflated the count with below-threshold rows
    // and burned slots whose keep_flag was never written).
    float confidence = pitem[4];
    if (confidence < confidence_threshold)
        return;
    int index = atomicAdd(parray, 1);
    if (index >= max_objects)
        return;
    float left = pitem[0];
    float top = pitem[1];
    float right = pitem[2];
    float bottom = pitem[3];
    float label = pitem[5];
    float* pout_item = parray + 1 + index * bbox_element;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1;  // keep_flag; may be cleared later by NMS
}
// Intersection-over-union of two axis-aligned boxes given as
// (left, top, right, bottom). Returns 0 when the boxes do not overlap.
static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop,
    float bright, float bbottom) {
    // Corners of the intersection rectangle.
    float ileft = max(aleft, bleft);
    float itop = max(atop, btop);
    float iright = min(aright, bright);
    float ibottom = min(abottom, bbottom);
    // Clamp to zero so disjoint boxes produce an empty intersection.
    float inter_area = max(iright - ileft, 0.0f) * max(ibottom - itop, 0.0f);
    if (inter_area == 0.0f)
        return 0.0f;
    float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter_area / (area_a + area_b - inter_area);
}
// Parallel NMS over the decoded array: one thread per kept box. A box is
// suppressed (keep-flag bboxes[...][6] set to 0) when a same-class box with
// higher confidence overlaps it by more than `threshold`. Confidence ties are
// broken by index so exactly one of the pair survives.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold) {
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    // BUGFIX: clamp the stored count to max_objects. The decode kernel's
    // atomicAdd can drive bboxes[0] past max_objects even though only
    // max_objects slots are ever written; reading the raw count here walked
    // off the end of the output buffer.
    int count = min((int)bboxes[0], max_objects);
    if (position >= count)
        return;
    float* pcurrent = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5])  // skip self and other classes
            continue;
        if (pitem[4] >= pcurrent[4]) {
            // On equal confidence, only the later-indexed box may be suppressed.
            if (pitem[4] == pcurrent[4] && i < position)
                continue;
            float iou =
                box_iou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pitem[0], pitem[1], pitem[2], pitem[3]);
            if (iou > threshold) {
                pcurrent[6] = 0;  // clear keep-flag
                return;
            }
        }
    }
}
// Build the 2x2 covariance matrix of a rotated box treated as a uniform
// distribution: axis variances w^2/12 and h^2/12, rotated by angle r.
// Outputs a, b, c are the matrix entries [[a, c], [c, b]].
static __device__ void convariance_matrix(float w, float h, float r, float& a, float& b, float& c) {
    // Variance of a uniform distribution over a segment of length L is L^2/12.
    float var_w = w * w / 12.0f;
    float var_h = h * h / 12.0f;
    float cos_t = cosf(r);
    float sin_t = sinf(r);
    a = var_w * cos_t * cos_t + var_h * sin_t * sin_t;
    b = var_w * sin_t * sin_t + var_h * cos_t * cos_t;
    c = (var_w - var_h) * sin_t * cos_t;
}
// Probabilistic IoU between two oriented boxes (cx, cy, w, h, angle r), each
// modeled as a Gaussian via convariance_matrix. Returns a similarity in [0, 1]
// (1 = identical boxes). NOTE(review): this follows the ProbIoU /
// Bhattacharyya-distance formulation used by Ultralytics OBB — confirm against
// the reference implementation before altering any term.
static __device__ float box_probiou(float cx1, float cy1, float w1, float h1, float r1, float cx2, float cy2, float w2,
    float h2, float r2, float eps = 1e-7) {

    float a1, b1, c1, a2, b2, c2;
    convariance_matrix(w1, h1, r1, a1, b1, c1);
    convariance_matrix(w2, h2, r2, a2, b2, c2);
    // t1, t2: Mahalanobis-style terms from the center offsets under the summed
    // covariance; eps guards the determinant denominator against zero.
    float t1 = ((a1 + a2) * powf(cy1 - cy2, 2) + (b1 + b2) * powf(cx1 - cx2, 2)) /
        ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    float t2 = ((c1 + c2) * (cx2 - cx1) * (cy1 - cy2)) / ((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2) + eps);
    // t3: log-ratio of the summed covariance determinant to the geometric mean
    // of the individual determinants (fmaxf clamps tiny negatives from rounding).
    float t3 = logf(((a1 + a2) * (b1 + b2) - powf(c1 + c2, 2)) /
        (4 * sqrtf(fmaxf(a1 * b1 - c1 * c1, 0.0f)) * sqrtf(fmaxf(a2 * b2 - c2 * c2, 0.0f)) + eps) +
        eps);
    // Bhattacharyya distance, clamped to [eps, 100] for numerical stability.
    float bd = 0.25f * t1 + 0.5f * t2 + 0.5f * t3;
    bd = fmaxf(fminf(bd, 100.0f), eps);
    // Hellinger distance, then convert distance -> similarity.
    float hd = sqrtf(1.0f - expf(-bd) + eps);
    return 1 - hd;
}
// Parallel NMS for oriented boxes: identical structure to nms_kernel, but
// overlap is measured with box_probiou using the angle stored at slot index 7.
// Suppression clears the keep-flag at slot index 6.
static __global__ void nms_kernel_obb(float* bboxes, int max_objects, float threshold) {
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    // BUGFIX: clamp the stored count to max_objects. The decode kernel's
    // atomicAdd can drive bboxes[0] past max_objects even though only
    // max_objects slots are ever written; reading the raw count here walked
    // off the end of the output buffer.
    int count = min((int)bboxes[0], max_objects);
    if (position >= count)
        return;
    float* pcurrent = bboxes + 1 + position * bbox_element;
    for (int i = 0; i < count; ++i) {
        float* pitem = bboxes + 1 + i * bbox_element;
        if (i == position || pcurrent[5] != pitem[5])  // skip self and other classes
            continue;
        if (pitem[4] >= pcurrent[4]) {
            // On equal confidence, only the later-indexed box may be suppressed.
            if (pitem[4] == pcurrent[4] && i < position)
                continue;
            float iou = box_probiou(pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], pcurrent[7], pitem[0], pitem[1],
                pitem[2], pitem[3], pitem[7]);
            if (iou > threshold) {
                pcurrent[6] = 0;  // clear keep-flag
                return;
            }
        }
    }
}
// Host-side launcher for decode_kernel: one thread per candidate box,
// 256 threads per block, enqueued on the caller's stream (asynchronous).
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
    cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel << <num_blocks, threads_per_block, 0, stream >> > (predict, num_bboxes, confidence_threshold,
        parray, max_objects);
}
// Host-side launcher for nms_kernel: one thread per output slot, block size
// capped at 256, enqueued on the caller's stream (asynchronous).
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    const int threads_per_block = (max_objects < 256) ? max_objects : 256;
    const int num_blocks = ceil(max_objects / (float)threads_per_block);
    nms_kernel << <num_blocks, threads_per_block, 0, stream >> > (parray, max_objects, nms_threshold);
}
// Host-side launcher for decode_kernel_obb: one thread per candidate box,
// 256 threads per block, enqueued on the caller's stream (asynchronous).
void cuda_decode_obb(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
    cudaStream_t stream) {
    const int threads_per_block = 256;
    const int num_blocks = ceil(num_bboxes / (float)threads_per_block);
    decode_kernel_obb << <num_blocks, threads_per_block, 0, stream >> > (predict, num_bboxes, confidence_threshold,
        parray, max_objects);
}
// Host-side launcher for nms_kernel_obb: one thread per output slot, block
// size capped at 256, enqueued on the caller's stream (asynchronous).
void cuda_nms_obb(float* parray, float nms_threshold, int max_objects, cudaStream_t stream) {
    const int threads_per_block = (max_objects < 256) ? max_objects : 256;
    const int num_blocks = ceil(max_objects / (float)threads_per_block);
    nms_kernel_obb << <num_blocks, threads_per_block, 0, stream >> > (parray, max_objects, nms_threshold);
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.18)
project(Yolov8obb-Cuda) 
add_definitions(-DAPI_EXPORTS)	
add_definitions(-DNOMINMAX)
set(Tensorrt_Include "E:\\Tensorrt\\8.6.1.6\\samples\\common")
set(Window_Include "E:\\Tensorrt\\8.6.1.6\\window")
set(OpenCV_DIR "E:\\Opencv contrib\\newbuild\\install") 
set(OpenCV_INCLUDE_DIRS ${OpenCV_DIR}\\include)
set(OpenCV_LIB_DIRS ${OpenCV_DIR}\\x64\\vc17\\lib)
set(OpenCV_LIB_DEBUG ${OpenCV_DIR}\\x64\\vc17\\lib\\opencv_world470d.lib) 
set(OpenCV_LIB_RELEASE ${OpenCV_DIR}\\x64\\vc17\\lib\\opencv_world470.lib)				
set(CMAKE_CUDA_ARCHITECTURES 52)	
find_package(CUDA REQUIRED)			
enable_language(CUDA)  						
find_package(OpenCV QUIET)
include_directories(${CUDA_INCLUDE_DIRS})	
include_directories(${OpenCV_INCLUDE_DIRS}) 
include_directories(${Tensorrt_Include}) 
include_directories(${Window_Include}) 
link_directories(${OpenCV_LIB_DIRS})  		
set(SOURCES
    main.cpp
    yolo_obb.cpp
    logger.cpp
    process.cu
    yolo_obb.h
    process.h
    logger.h
    yolo_cuda_utils.h
)
add_executable(Yolov8obb-Cuda ${SOURCES})
target_link_libraries(Yolov8obb-Cuda "nvinfer.lib" "nvinfer_plugin.lib" "nvonnxparser.lib"
                      "nvparsers.lib"
                      "cublas.lib" "cublasLt.lib" "cuda.lib" "cudadevrt.lib" "cudart.lib"
                      "cudart_static.lib" "cudnn.lib" "cufft.lib" "cufftw.lib" "curand.lib" "cusolver.lib"
                      "cusolverMg.lib" "cusparse.lib" "nppc.lib" "nppial.lib"
                      "nppicc.lib" "nppidei.lib" "nppif.lib" "nppig.lib"
                      "nppim.lib" "nppist.lib" "nppisu.lib" "nppitc.lib" 
                      "npps.lib"  "nvblas.lib" "nvjpeg.lib" "nvml.lib"
                      "nvrtc.lib") 		
target_link_libraries(Yolov8obb-Cuda ${CUDA_LIBRARIES}) 
target_link_libraries(Yolov8obb-Cuda
    $<$<CONFIG:Debug>:${OpenCV_LIB_DEBUG}>
    $<$<CONFIG:Release>:${OpenCV_LIB_RELEASE}>
) 	
Logo

火山引擎开发者社区是火山引擎打造的AI技术生态平台,聚焦Agent与大模型开发,提供豆包系列模型(图像/视频/视觉)、智能分析与会话工具,并配套评测集、动手实验室及行业案例库。社区通过技术沙龙、挑战赛等活动促进开发者成长,新用户可领50万Tokens权益,助力构建智能应用。

更多推荐