YOLOv7-Pose Series: CUDA Pre- and Post-Processing
Project: https://gitcode.com/gh_mirrors/yo/yolov7
yolov7-pose.cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <numeric>
#include <chrono>
#include <vector>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
#include "utils.hpp"
#include "preprocess.h"
#include "postprocess.h"
#define MAX_IMAGE_INPUT_SIZE_THRESH 5000 * 5000 // largest input image (in pixels) the staging buffers can hold
#define MAX_OBJECTS 2048 // capacity of the decode output buffer
#define NUM_BOX_ELEMENT 17 // x1, y1, x2, y2, score, label, keepflag + 5 keypoints * (x, y)
using namespace std;
struct affineMatrix // letterbox affine transform matrices
{
float i2d[6]; // forward transform (original image -> network input)
float d2i[6]; // inverse transform (network input -> original image)
};
struct bbox
{
float x1,x2,y1,y2;
float landmarks[10]; // 5 keypoints, 2 * 5 = 10 floats
float score;
};
#define CHECK(status) \
do\
{\
auto ret = (status);\
if (ret != 0)\
{\
std::cerr << "Cuda failure: " << ret << std::endl;\
abort();\
}\
} while (0)
#define DEVICE 0 // GPU id
#define NMS_THRESH 0.45
#define BBOX_CONF_THRESH 0.3
using namespace nvinfer1;
static const int INPUT_W = 640;
static const int INPUT_H = 640;
static const int NUM_CLASSES = 1; // number of classes
static const int CKPT_NUM = 5; // number of keypoints
const char* INPUT_BLOB_NAME = "images"; // ONNX input tensor name
const char* OUTPUT_BLOB_NAME = "output"; // ONNX output tensor name
static Logger gLogger;
void affine_project(float *d2i,float x,float y,float *ox,float *oy) // map a point back to original-image coordinates via the inverse affine matrix
{
*ox = d2i[0]*x+d2i[1]*y+d2i[2];
*oy = d2i[3]*x+d2i[4]*y+d2i[5];
}
const float color_list[5][3] =
{
{255, 0, 0},
{0, 255, 0},
{0, 0, 255},
{0, 255, 255},
{255,255,0},
};
void getd2i(affineMatrix &afmt,cv::Size to,cv::Size from) // compute the letterbox affine matrix and its inverse
{
float scale = std::min(1.0*to.width/from.width, 1.0*to.height/from.height);
afmt.i2d[0]=scale;
afmt.i2d[1]=0;
afmt.i2d[2]=-scale*from.width*0.5+to.width*0.5;
afmt.i2d[3]=0;
afmt.i2d[4]=scale;
afmt.i2d[5]=-scale*from.height*0.5+to.height*0.5;
cv::Mat i2d_mat(2,3,CV_32F,afmt.i2d);
cv::Mat d2i_mat(2,3,CV_32F,afmt.d2i);
cv::invertAffineTransform(i2d_mat,d2i_mat);
memcpy(afmt.d2i, d2i_mat.ptr<float>(0), sizeof(afmt.d2i));
}
int main(int argc, char** argv)
{
cudaSetDevice(DEVICE);
char *trtModelStreamDet{nullptr};
size_t size{0};
if (argc < 3) {
std::cerr << "usage: " << argv[0] << " <engine_path> <image_dir>" << std::endl;
return -1;
}
const std::string engine_file_path {argv[1]};
std::ifstream file(engine_file_path, std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStreamDet = new char[size];
assert(trtModelStreamDet);
file.read(trtModelStreamDet, size);
file.close();
} else {
std::cerr << "failed to read engine file: " << engine_file_path << std::endl;
return -1;
}
// initialize TensorRT for the detection model
IRuntime* runtime_det = createInferRuntime(gLogger);
assert(runtime_det != nullptr);
ICudaEngine* engine_det = runtime_det->deserializeCudaEngine(trtModelStreamDet, size);
assert(engine_det != nullptr);
IExecutionContext* context_det = engine_det->createExecutionContext();
assert(context_det != nullptr);
delete[] trtModelStreamDet;
float *buffers[2];
const int inputIndex = engine_det->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine_det->getBindingIndex(OUTPUT_BLOB_NAME);
assert(inputIndex == 0);
assert(outputIndex == 1);
// Create GPU buffers on device
auto out_dims = engine_det->getBindingDimensions(outputIndex);
auto output_size = 1;
int OUTPUT_CANDIDATES = out_dims.d[1]; // number of raw candidate boxes
for(int j=0;j<out_dims.nbDims;j++) {
output_size *= out_dims.d[j];
cout<< "out_dims.d["<< j << "] : "<< out_dims.d[j] <<endl;
}
cout<< "output candidates: "<< OUTPUT_CANDIDATES << endl;
CHECK(cudaMalloc((void**)&buffers[inputIndex], 3 * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc((void**)&buffers[outputIndex], output_size * sizeof(float)));
// Create stream
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
uint8_t* img_host = nullptr;
uint8_t* img_device = nullptr;
float *affine_matrix_d2i_host = nullptr;
float *affine_matrix_d2i_device = nullptr;
float *decode_ptr_device = nullptr;
float *decode_ptr_host = nullptr;
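// decode buffer layout: [count | MAX_OBJECTS * (x1, y1, x2, y2, score, label, keepflag, 5 * (kx, ky))]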
decode_ptr_host = new float[1+MAX_OBJECTS*NUM_BOX_ELEMENT];
// prepare input data cache in pinned memory
CHECK(cudaMallocHost((void**)&img_host, MAX_IMAGE_INPUT_SIZE_THRESH * 3));
// prepare input data cache in device memory
CHECK(cudaMalloc((void**)&img_device, MAX_IMAGE_INPUT_SIZE_THRESH * 3));
CHECK(cudaMallocHost(&affine_matrix_d2i_host,sizeof(float)*6));
CHECK(cudaMalloc(&affine_matrix_d2i_device,sizeof(float)*6));
CHECK(cudaMalloc(&decode_ptr_device,sizeof(float)*(1+MAX_OBJECTS*NUM_BOX_ELEMENT)));
std::string imgPath = argv[2];
std::vector<std::string> imagList;
std::vector<std::string> fileType{"jpg","png"};
readFileList(const_cast<char *>(imgPath.c_str()),imagList,fileType);
double sumTime = 0;
int index = 0;
cv::Size to(INPUT_W, INPUT_H);
for (auto &input_image_path:imagList)
{
affineMatrix afmt;
cv::Mat img = cv::imread(input_image_path);
if (img.empty()) continue; // skip unreadable files
getd2i(afmt, to, cv::Size(img.cols, img.rows)); // letterbox matrices for this image
double begin_time = cv::getTickCount();
float *buffer_idx = (float*)buffers[inputIndex];
size_t size_image = img.cols * img.rows * 3;
size_t size_image_dst = INPUT_H * INPUT_W * 3;
memcpy(affine_matrix_d2i_host,afmt.d2i,sizeof(afmt.d2i));
memcpy(img_host, img.data, size_image);
CHECK(cudaMemcpyAsync(img_device,
img_host,
size_image,
cudaMemcpyHostToDevice,
stream));
CHECK(cudaMemcpyAsync(affine_matrix_d2i_device,
affine_matrix_d2i_host,
sizeof(afmt.d2i),
cudaMemcpyHostToDevice,stream));
preprocess_kernel_img(img_device,
img.cols,
img.rows,
buffer_idx,
INPUT_W,
INPUT_H,
affine_matrix_d2i_device,
stream); // GPU preprocessing: letterbox warp + BGR->RGB + /255 normalization + HWC->CHW
double time_pre = cv::getTickCount();
double time_pre_=(time_pre-begin_time)/cv::getTickFrequency()*1000;
// note: the kernel launch is asynchronous, so this measures CPU-side launch time only
std::cout<<"preprocessing time is "<<time_pre_<<" ms"<<std::endl;
context_det->enqueueV2((void**)buffers, stream, nullptr); // asynchronous inference
float *predict = (float *)buffers[outputIndex]; // device pointer to the raw output tensor
double time_infer = cv::getTickCount();
time_infer =(time_infer-begin_time)/cv::getTickFrequency()*1000;
std::cout<<"time_infer is "<<time_infer<<" ms"<<std::endl; // also CPU-side only; enqueueV2 has not been synchronized yet
CHECK(cudaMemsetAsync(decode_ptr_device,
0,
sizeof(int),
stream)); // zero only the box counter in the first slot
decode_kernel_invoker(predict,
OUTPUT_CANDIDATES,
NUM_CLASSES,
CKPT_NUM,
BBOX_CONF_THRESH,
affine_matrix_d2i_device,
decode_ptr_device,
MAX_OBJECTS,
stream); // GPU decode of raw predictions into [count | boxes]
nms_kernel_invoker(decode_ptr_device, NMS_THRESH, MAX_OBJECTS, stream); // GPU NMS
CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float)*(1+MAX_OBJECTS*NUM_BOX_ELEMENT), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
double end_time = cv::getTickCount();
std::vector<bbox> boxes;
// number of boxes actually kept after NMS
int boxes_count= 0;
// element 0 of decode_ptr_host holds the total number of decoded boxes
int count = std::min((int)*decode_ptr_host, MAX_OBJECTS);
// walk all decoded candidates
for (int i = 0; i < count; i++)
{
int basic_pos = 1 + i * NUM_BOX_ELEMENT;
// keep flag written by the NMS kernel (1 = keep, 0 = suppressed)
int keep_flag = decode_ptr_host[basic_pos + 6];
if (keep_flag == 1)
{
boxes_count += 1;
bbox box;
box.x1 = decode_ptr_host[basic_pos + 0];
box.y1 = decode_ptr_host[basic_pos + 1];
box.x2 = decode_ptr_host[basic_pos + 2];
box.y2 = decode_ptr_host[basic_pos + 3];
box.score = decode_ptr_host[basic_pos + 4];
// offset of the first keypoint inside this box record (after 7 box fields)
int landmark_pos = basic_pos + 7;
// 5 keypoints stored as (x1,y1)(x2,y2)(x3,y3)...
for (int id = 0; id < CKPT_NUM; id += 1)
{
box.landmarks[2 * id] = decode_ptr_host[landmark_pos + 2 * id]; //xi
box.landmarks[2 * id + 1 ] = decode_ptr_host[landmark_pos + 2 * id + 1]; //yi
}
boxes.push_back(box);
}
}
double time_post = cv::getTickCount();
time_post =(time_post - begin_time)/cv::getTickFrequency()*1000;
std::cout<<"time_post is " << time_post <<" ms" << std::endl;
std::cout<<input_image_path <<" " << "boxes count: " << boxes_count << endl;
// draw boxes and keypoints
for (int i = 0; i<boxes_count; i++)
{
cv::Rect roi_area(boxes[i].x1,boxes[i].y1,boxes[i].x2-boxes[i].x1,boxes[i].y2-boxes[i].y1);
cv::rectangle(img, roi_area, cv::Scalar(0,255,0), 2);
for (int j= 0; j < CKPT_NUM; j++)
{
cv::Scalar color = cv::Scalar(color_list[j][0], color_list[j][1], color_list[j][2]);
cv::circle(img,cv::Point(boxes[i].landmarks[2*j], boxes[i].landmarks[2*j+1]), 2, color, -1);
}
}
auto time_gap = (end_time-begin_time)/cv::getTickFrequency()*1000;
std::cout<<" time_gap: "<<time_gap<<"ms ";
if (index) // skip the first (warm-up) iteration when accumulating the average
{
sumTime += time_gap;
}
std::cout<<std::endl;
index += 1;
int pos = input_image_path.find_last_of("/");
std::string image_name = input_image_path.substr(pos+1);
cv::imwrite(image_name, img);
}
// destroy the engine
std::cout<<"averageTime:"<<(sumTime/(imagList.size() - 1 ))<<"ms"<<std::endl;
context_det->destroy();
engine_det->destroy();
runtime_det->destroy();
cudaStreamDestroy(stream);
CHECK(cudaFree(affine_matrix_d2i_device));
CHECK(cudaFreeHost(affine_matrix_d2i_host));
CHECK(cudaFree(img_device));
CHECK(cudaFreeHost(img_host));
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
CHECK(cudaFree(decode_ptr_device));
delete [] decode_ptr_host;
return 0;
}
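A quick way to trust getd2i and affine_project is to round-trip a point: map it from a hypothetical 1920x1080 frame into the 640x640 network input with i2d, then back with d2i. The standalone sketch below is my addition (not part of the original sources) and assumes the affineMatrix struct and the two functions above are linked in:
#include <cstdio>
#include <opencv2/opencv.hpp>
// assumes affineMatrix, getd2i and affine_project from yolov7-pose.cpp are visible
int main()
{
affineMatrix afmt;
getd2i(afmt, cv::Size(640, 640), cv::Size(1920, 1080)); // hypothetical frame size
float x = 100.f, y = 200.f, lx, ly, rx, ry;
// forward: original image -> letterboxed network input (scale = 1/3, vertical offset = 140)
lx = afmt.i2d[0] * x + afmt.i2d[1] * y + afmt.i2d[2];
ly = afmt.i2d[3] * x + afmt.i2d[4] * y + afmt.i2d[5];
// inverse: network input -> original image; should print (100.0, 200.0) again
affine_project(afmt.d2i, lx, ly, &rx, &ry);
printf("(%.1f, %.1f) -> (%.1f, %.1f) -> (%.1f, %.1f)\n", x, y, lx, ly, rx, ry);
return 0;
}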
preprocess.cu
#include "preprocess.h"
__global__ void warpaffine_kernel(
uint8_t* src, int src_line_size, int src_width,
int src_height, float* dst, int dst_width,
int dst_height, uint8_t const_value_st,
float *d2i, int edge) {
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= edge) return;
float m_x1 = d2i[0];
float m_y1 = d2i[1];
float m_z1 = d2i[2];
float m_x2 = d2i[3];
float m_y2 = d2i[4];
float m_z2 = d2i[5];
int dx = position % dst_width;
int dy = position / dst_width;
float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
float c0, c1, c2;
if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
// out of range
c0 = const_value_st;
c1 = const_value_st;
c2 = const_value_st;
} else {
int y_low = floorf(src_y);
int x_low = floorf(src_x);
int y_high = y_low + 1;
int x_high = x_low + 1;
uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
float ly = src_y - y_low;
float lx = src_x - x_low;
float hy = 1 - ly;
float hx = 1 - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
uint8_t* v1 = const_value;
uint8_t* v2 = const_value;
uint8_t* v3 = const_value;
uint8_t* v4 = const_value;
if (y_low >= 0) {
if (x_low >= 0)
v1 = src + y_low * src_line_size + x_low * 3;
if (x_high < src_width)
v2 = src + y_low * src_line_size + x_high * 3;
}
if (y_high < src_height) {
if (x_low >= 0)
v3 = src + y_high * src_line_size + x_low * 3;
if (x_high < src_width)
v4 = src + y_high * src_line_size + x_high * 3;
}
c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
}
// BGR -> RGB (swap channels 0 and 2)
float t = c2;
c2 = c0;
c0 = t;
// normalize to [0, 1]
c0 = c0 / 255.0f;
c1 = c1 / 255.0f;
c2 = c2 / 255.0f;
// interleaved HWC (rgbrgb...) -> planar CHW (rrr...ggg...bbb)
int area = dst_width * dst_height;
float* pdst_c0 = dst + dy * dst_width + dx;
float* pdst_c1 = pdst_c0 + area;
float* pdst_c2 = pdst_c1 + area;
*pdst_c0 = c0;
*pdst_c1 = c1;
*pdst_c2 = c2;
}
void preprocess_kernel_img(
uint8_t* src, int src_width, int src_height,
float* dst, int dst_width, int dst_height,
float*d2i,cudaStream_t stream) {
int jobs = dst_height * dst_width;
int threads = 256;
int blocks = ceil(jobs / (float)threads);
warpaffine_kernel<<<blocks, threads, 0, stream>>>(
src, src_width*3, src_width,
src_height, dst, dst_width,
dst_height, 128, d2i, jobs);
}
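The kernel is easy to exercise on its own. The harness below is a minimal sketch of mine (not in the original project): it pushes a uniform mid-gray 8x8 BGR image through preprocess_kernel_img with a hand-built d2i (sizes and matrix values are made up for the test) and checks that the planar output is about 128/255:
#include <cstdio>
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "preprocess.h"
int main()
{
const int src_w = 8, src_h = 8; // tiny dummy source image
const int dst_w = 4, dst_h = 4; // tiny destination tensor
std::vector<uint8_t> host_src(src_w * src_h * 3, 128); // uniform mid-gray BGR
uint8_t* d_src; float* d_dst; float* d_d2i;
cudaMalloc(&d_src, host_src.size());
cudaMalloc(&d_dst, dst_w * dst_h * 3 * sizeof(float));
cudaMalloc(&d_d2i, 6 * sizeof(float));
float d2i[6] = {2.f, 0.f, 0.f, 0.f, 2.f, 0.f}; // dst -> src is a pure 2x upscale (8/4), no offset
cudaMemcpy(d_src, host_src.data(), host_src.size(), cudaMemcpyHostToDevice);
cudaMemcpy(d_d2i, d2i, sizeof(d2i), cudaMemcpyHostToDevice);
preprocess_kernel_img(d_src, src_w, src_h, d_dst, dst_w, dst_h, d_d2i, 0); // default stream
cudaDeviceSynchronize();
std::vector<float> host_dst(dst_w * dst_h * 3);
cudaMemcpy(host_dst.data(), d_dst, host_dst.size() * sizeof(float), cudaMemcpyDeviceToHost);
printf("dst[0] = %f (expect ~0.502 = 128/255)\n", host_dst[0]);
cudaFree(d_src); cudaFree(d_dst); cudaFree(d_d2i);
return 0;
}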
postprocess.cu
#include "postprocess.h"
const int NUM_BOX_ELEMENT = 17; // left, top, right, bottom, confidence, class, keepflag + 5 keypoints * (x, y)
static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
*ox = matrix[0] * x + matrix[1] * y + matrix[2];
*oy = matrix[3] * x + matrix[4] * y + matrix[5];
}
static __global__ void decode_kernel(float* predict, int num_bboxes, int num_classes,int ckpt, float confidence_threshold, float* invert_affine_matrix, float* parray, int max_objects){
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= num_bboxes) return;
/*
Each candidate occupies 5 + 1 + 5*3 = 21 floats:
5 box fields:
bx: box center x
by: box center y
bw: box width
bh: box height
conf: objectness confidence
1 class field (one score per class; this model has a single face class):
c0: face class score
15 keypoint fields (5 keypoints, each with x, y, conf):
kx: keypoint x
ky: keypoint y
kconf: keypoint confidence
So one detection is described by 5 + 1 + 15 = 21 floats covering box and keypoints.
*/
// each thread decodes one candidate, i.e. 21 consecutive floats
float* pitem = predict + (5 + num_classes + ckpt * 3) * position; // predict + 21 * position, position = 0, 1, 2, ...
// objectness confidence of this candidate
float objectness = pitem[4];
if(objectness < confidence_threshold)
return;
// pick the class: with several classes, keep the one with the highest score
float* class_confidence = pitem + 5; // class scores start right after the 5 box fields
// score of the first class
float confidence = *class_confidence++;
int label = 0;
for(int i = 1; i < num_classes; ++i, ++class_confidence){
// a later class replaces the label only if its score is higher
if(*class_confidence > confidence){
confidence = *class_confidence;
label = i;
}
}
confidence *= objectness;
if(confidence < confidence_threshold)
return;
/*
A CUDA atomic operation performs read-modify-write on a variable as one
indivisible step: while it executes, no other thread can read or write
that variable. This mutual exclusion keeps concurrent updates of a
shared variable correct.
*/
// atomically bump the box counter in parray[0]; the returned old value is
// this thread's unique output slot (e.g. two passing threads get 0 and 1)
int index = atomicAdd(parray, 1);
if(index >= max_objects)
return;
// printf("index %d max_objects %d\n", index,max_objects);
float cx = pitem[0];
float cy = pitem[1];
float width = pitem[2];
float height = pitem[3];
// five keypoints, each stored as (x, y, conf); the conf slot is skipped below
float *landmarks = pitem + 5 + num_classes;
float x1 = landmarks[0];
float y1 = landmarks[1];
float x2 = landmarks[3];
float y2 = landmarks[4];
float x3 = landmarks[6];
float y3 = landmarks[7];
float x4 = landmarks[9];
float y4 = landmarks[10];
float x5 = landmarks[12];
float y5 = landmarks[13];
float left = cx - width * 0.5f;
float top = cy - height * 0.5f;
float right = cx + width * 0.5f;
float bottom = cy + height * 0.5f;
// map box corners and keypoints back to original-image coordinates via the inverse affine transform
affine_project(invert_affine_matrix, left, top, &left, &top);
affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
affine_project(invert_affine_matrix, x1,y1,&x1,&y1);
affine_project(invert_affine_matrix, x2,y2,&x2,&y2);
affine_project(invert_affine_matrix, x3,y3,&x3,&y3);
affine_project(invert_affine_matrix, x4,y4,&x4,&y4);
affine_project(invert_affine_matrix, x5,y5,&x5,&y5);
// write the decoded box into the output array
float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
*pout_item++ = left;
*pout_item++ = top;
*pout_item++ = right;
*pout_item++ = bottom;
*pout_item++ = confidence;
*pout_item++ = label;
*pout_item++ = 1; // 1 = keep, 0 = ignore
// five keypoints
*pout_item++ = x1;
*pout_item++ = y1;
*pout_item++ = x2;
*pout_item++ = y2;
*pout_item++ = x3;
*pout_item++ = y3;
*pout_item++ = x4;
*pout_item++ = y4;
*pout_item++ = x5;
*pout_item++ = y5;
}
static __device__ float box_iou(
float aleft, float atop, float aright, float abottom,
float bleft, float btop, float bright, float bbottom
){
float cleft = max(aleft, bleft);
float ctop = max(atop, btop);
float cright = min(aright, bright);
float cbottom = min(abottom, bbottom);
float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
if(c_area == 0.0f)
return 0.0f;
float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
return c_area / (a_area + b_area - c_area);
}
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold){
int position = (blockDim.x * blockIdx.x + threadIdx.x);
// number of decoded boxes; NMS removes overlapping duplicates of one object
int count = min((int)*bboxes, max_objects);
if (position >= count)
return;
// layout: left, top, right, bottom, confidence, class, keepflag, keypoints
// one thread per box: pcurrent points at the box handled by this thread
float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
// compare this thread's box against every other box
for(int i = 0; i < count; ++i){
float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
// skip self and boxes of a different class
if(i == position || pcurrent[5] != pitem[5]) continue;
// only a box with higher (or equal) confidence can suppress this one
if(pitem[4] >= pcurrent[4]){
// tie-break on equal confidence: skip earlier-indexed boxes so that
// two identical scores do not suppress each other; exactly one survives
if(pitem[4] == pcurrent[4] && i < position)
continue;
// compute the IoU between the two boxes
float iou = box_iou(
pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
pitem[0], pitem[1], pitem[2], pitem[3]
);
if(iou > threshold){
// suppress this thread's box and stop
pcurrent[6] = 0; // 1=keep, 0=ignore
return;
}
}
}
}
void decode_kernel_invoker(float* predict, int num_bboxes, int num_classes,int ckpt, float confidence_threshold, float* invert_affine_matrix, float* parray, int max_objects, cudaStream_t stream)
{
// 256 threads per block
int block = 256;
int grid = ceil(num_bboxes / (float)block);
decode_kernel<<<grid, block, 0, stream>>>(predict, num_bboxes, num_classes,ckpt, confidence_threshold, invert_affine_matrix, parray, max_objects);
}
void nms_kernel_invoker(float* parray, float nms_threshold, int max_objects, cudaStream_t stream){
int block = max_objects<256? max_objects:256;
int grid = ceil(max_objects / (float)block);
nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}
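The IoU math is worth cross-checking once on the CPU. The snippet below is a sketch of mine (not part of the original files) that mirrors box_iou and evaluates one hand-computed case:
#include <algorithm>
#include <cstdio>
// host-side mirror of the device box_iou above
static float box_iou_cpu(float al, float at, float ar, float ab,
float bl, float bt, float br, float bb)
{
float cl = std::max(al, bl), ct = std::max(at, bt);
float cr = std::min(ar, br), cb = std::min(ab, bb);
float inter = std::max(cr - cl, 0.f) * std::max(cb - ct, 0.f);
if (inter == 0.f) return 0.f;
float a = std::max(0.f, ar - al) * std::max(0.f, ab - at);
float b = std::max(0.f, br - bl) * std::max(0.f, bb - bt);
return inter / (a + b - inter);
}
int main()
{
// two unit squares overlapping by half: inter = 0.5, union = 1.5, IoU = 1/3
printf("%f\n", box_iou_cpu(0.f, 0.f, 1.f, 1.f, 0.5f, 0.f, 1.5f, 1.f));
return 0;
}
With NMS_THRESH = 0.45, such a pair would not suppress each other; only overlaps above 45% IoU flip the keep flag. All three files are built together, the two kernel files with nvcc, and linked against TensorRT, the CUDA runtime, and OpenCV.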