diff --git a/CMakeLists.txt b/CMakeLists.txt
index e550a53a..320c2ada 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,13 +19,18 @@ if (CMAKE_COMPILER_IS_GNUCXX)
     set(CMAKE_CXX_FLAGS "-std=c++17 -Wno-deprecated ${CMAKE_CXX_FLAGS} ")
     message(STATUS "[Linux GCC Compiler Options]+:-std=c++17 -Wno-deprecated")
 endif ()
+# Specify the CUDA compiler
+set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
+set(CMAKE_CUDA_ARCHITECTURES 89) # compute capability 8.9 = Ada Lovelace (RTX 40xx series)
+# Enable CUDA
+enable_language(CUDA)
 set(LITE_AI_ROOT_DIR ${CMAKE_SOURCE_DIR})

-option(ENABLE_TEST "build test examples." OFF)
+option(ENABLE_TEST "build test examples." ON)
 option(ENABLE_DEBUG_STRING "enable DEBUG string or not" ON)
 option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine" ON)
-option(ENABLE_TENSORRT "enable TensorRT engine" OFF)
+option(ENABLE_TENSORRT "enable TensorRT engine" ON)
 option(ENABLE_MNN "enable MNN engine" OFF)
 option(ENABLE_NCNN "enable NCNN engine" OFF)
 option(ENABLE_TNN "enable TNN engine" OFF)
diff --git a/cmake/opencv.cmake b/cmake/opencv.cmake
index cea2b051..912446c5 100644
--- a/cmake/opencv.cmake
+++ b/cmake/opencv.cmake
@@ -19,9 +19,9 @@ link_directories(${OpenCV_DIR}/lib)
 if(NOT WIN32)
     if(ENABLE_OPENCV_VIDEOIO OR ENABLE_TEST)
-        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs opencv_video opencv_videoio)
+        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs opencv_video opencv_videoio opencv_calib3d)
     else()
-        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs) # no videoio, video module
+        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs opencv_calib3d) # no videoio, video module
     endif()
 else()
     set(OpenCV_LIBS opencv_world490)
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index 3de10b85..a88f4673 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -41,6 +41,8 @@ link_directories(${TensorRT_DIR}/lib)
 # 1. glob sources files
 file(GLOB TENSORRT_CORE_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/core/*.cpp)
+file(GLOB TENSORRT_CUDA_KERNEL_SRCS_CPP ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.cpp)
+file(GLOB TENSORRT_CUDA_KERNEL_SRCS_CU ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.cu)
 file(GLOB TENSORRT_CV_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/cv/*.cpp)
 file(GLOB TENSORRT_NLP_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/nlp/*.cpp)
 file(GLOB TENSORRT_ASR_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/asr/*.cpp)
@@ -52,8 +54,17 @@ file(GLOB TENSORRT_CV_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/cv/*.h)
 file(GLOB TENSORRT_NLP_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/nlp/*.h)
 file(GLOB TENSORRT_ASR_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/asr/*.h)
 file(GLOB TENSORRT_SD_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/sd/*.h)
+file(GLOB TENSORRT_CUDA_KERNEL_HEAD_CPP ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.h)
+file(GLOB TENSORRT_CUDA_KERNEL_HEAD_CU ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.cuh)
+
+
+
+set(TRT_SRCS ${TENSORRT_CV_SRCS} ${TENSORRT_NLP_SRCS} ${TENSORRT_ASR_SRCS} ${TENSORRT_CORE_SRCS} ${TENSORRT_SD_SRCS}
+        ${TENSORRT_CUDA_KERNEL_SRCS_CPP} ${TENSORRT_CUDA_KERNEL_SRCS_CU})
+set_source_files_properties(${TENSORRT_CUDA_KERNEL_SRCS_CU} ${TENSORRT_CUDA_KERNEL_SRCS_CPP}
+        ${TENSORRT_CUDA_KERNEL_HEAD_CPP} ${TENSORRT_CUDA_KERNEL_HEAD_CU}
+        PROPERTIES LANGUAGE CUDA)
-set(TRT_SRCS ${TENSORRT_CV_SRCS} ${TENSORRT_NLP_SRCS} ${TENSORRT_ASR_SRCS} ${TENSORRT_CORE_SRCS} ${TENSORRT_SD_SRCS})
 # 3. copy
 message("[Lite.AI.Toolkit][I] Installing Lite.AI.ToolKit Headers for TensorRT Backend ...")
 # "INSTALL" can copy all files from the list to the specified path.
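A note on the CMakeLists.txt change above: hardcoding the nvcc path and a single architecture ties the build to one machine. A hedged, more portable sketch (assumes CMake >= 3.24 for the `native` value; not part of this patch):

```cmake
# Hypothetical alternative: let CMake discover the toolchain and the local GPU.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES native) # targets the GPU of the build machine
endif()
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)       # provides imported targets such as CUDA::cudart
```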
@@ -63,4 +74,5 @@ file(INSTALL ${TENSORRT_CV_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lit
 file(INSTALL ${TENSORRT_ASR_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/asr)
 file(INSTALL ${TENSORRT_NLP_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/nlp)
 file(INSTALL ${TENSORRT_SD_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/sd)
-
+file(INSTALL ${TENSORRT_CUDA_KERNEL_HEAD_CPP} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/kernel)
+file(INSTALL ${TENSORRT_CUDA_KERNEL_HEAD_CU} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/kernel)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index f089cb7f..9073a3c4 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -52,7 +52,7 @@ function(add_lite_ai_toolkit_shared_library version soversion)
     if (ENABLE_TENSORRT)
         include(cmake/tensorrt.cmake)
         set(LITE_SRCS ${LITE_SRCS} ${TRT_SRCS})
-        set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cudart nvinfer nvonnxparser
+        set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cuda cudart nvinfer nvonnxparser
                 nvinfer_plugin ddim_scheduler_cpp)
         link_directories(${CMAKE_SOURCE_DIR}/lite/bin)
     endif ()
diff --git a/examples/lite/CMakeLists.txt b/examples/lite/CMakeLists.txt
index 929e07d5..8e99cd70 100644
--- a/examples/lite/CMakeLists.txt
+++ b/examples/lite/CMakeLists.txt
@@ -103,5 +103,10 @@ add_lite_executable(lite_face_parsing_bisenet_dyn cv)
 add_lite_executable(lite_yolov8face cv)
 add_lite_executable(lite_lightenhance cv)
 add_lite_executable(lite_realesrgan cv)
+add_lite_executable(lite_face_68landmarks cv)
+add_lite_executable(lite_face_recognizer cv)
+add_lite_executable(lite_face_swap cv)
+add_lite_executable(lite_face_restoration cv)
+add_lite_executable(lite_facefusion_pipeline cv)
 add_lite_executable(lite_yolov8 cv)
 add_lite_executable(lite_sd_pipeline sd)
diff --git a/examples/lite/cv/test_lite_face_68landmarks.cpp b/examples/lite/cv/test_lite_face_68landmarks.cpp
new file mode 100644
index 00000000..0f4a0da9
--- /dev/null
+++ b/examples/lite/cv/test_lite_face_68landmarks.cpp
@@ -0,0 +1,112 @@
+//
+// Created by wangzijian on 11/1/24.
+//
+#include "lite/lite.h"
+#include "lite/trt/cv/trt_face_68landmarks_mt.h"
+
+static void test_default()
+{
+#ifdef ENABLE_ONNXRUNTIME
+    std::string onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/2dfan4.onnx";
+    std::string test_img_path = "/home/lite.ai.toolkit/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
+
+    // 1. Test Default Engine ONNXRuntime
+    lite::cv::faceid::Face_68Landmarks *face68Landmarks = new lite::cv::faceid::Face_68Landmarks(onnx_path);
+
+    lite::types::BoundingBoxType<float, float> bbox;
+    bbox.x1 = 487;
+    bbox.y1 = 236;
+    bbox.x2 = 784;
+    bbox.y2 = 624;
+
+    cv::Mat img_bgr = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect(img_bgr, bbox, face_landmark_5of68);
+
+    std::cout << "face id detect done!" << std::endl;
+    lite::types::BoundingBoxType<float, float> bbox;
+    bbox.x1 = 487;
+    bbox.y1 = 236;
+    bbox.x2 = 784;
+    bbox.y2 = 624;
+
+    cv::Mat img_bgr = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect(img_bgr, bbox, face_landmark_5of68);
+
+    std::cout << "face id detect done!" << std::endl;
+    lite::types::BoundingBoxType<float, float> bbox;
+
+    bbox.x1 = 487;
+    bbox.y1 = 236;
+    bbox.x2 = 784;
+    bbox.y2 = 624;
+
+    cv::Mat img_bgr = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of68);
+
+    cv::Mat img_bgr2 = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of682;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of682);
+
+    cv::Mat img_bgr3 = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of683;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of683);
+
+    cv::Mat img_bgr4 = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of684;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of684);
+
+    face68Landmarks->wait_for_completion();
+
+    face68Landmarks->shutdown();
+
+    std::cout << "face id detect done!" << std::endl;
+    std::vector<cv::Point2f> face_landmark_5 = {
+            cv::Point2f(568.2485f, 398.9512f),
+            cv::Point2f(701.7346f, 399.64795f),
+            cv::Point2f(634.2213f, 482.92694f),
+            cv::Point2f(583.5656f, 543.10187f),
+            cv::Point2f(684.52405f, 543.125f)
+    };
+    cv::Mat img_bgr = cv::imread(test_img_path);
+
+    std::vector<float> source_image_embeding;
+
+    face_recognizer->detect(img_bgr,face_landmark_5,source_image_embeding);
+
+    std::cout << "face id detect done!" << std::endl;
+    std::vector<cv::Point2f> face_landmark_5 = {
+            cv::Point2f(569.092041f, 398.845886f),
+            cv::Point2f(701.891724f, 399.156677f),
+            cv::Point2f(634.767212f, 482.927216f),
+            cv::Point2f(584.270996f, 543.294617f),
+            cv::Point2f(684.877991f, 543.067078f)
+    };
+    cv::Mat img_bgr = cv::imread(test_img_path);
+
+    face_restoration->detect(img_bgr,face_landmark_5,save_img_path);
+
+    std::cout << "face restoration detect done!" << std::endl;
+    auto face_restoration_trt = std::make_unique<trt_face_restoration_mt>(engine_path,4);
+
+//    trt_face_restoration_mt *face_restoration_trt = new trt_face_restoration_mt(engine_path);
+
+    // 2. Prepare the test data - as an example, we process four copies of the same image
+    std::vector<std::string> test_img_paths = {
+            "/home/lite.ai.toolkit/trt_result.jpg",
+            "/home/lite.ai.toolkit/trt_result_2.jpg",
+            "/home/lite.ai.toolkit/trt_result_3.jpg",
+            "/home/lite.ai.toolkit/trt_result_4.jpg"
+    };
+
+    std::vector<std::string> save_img_paths = {
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread1.jpg",
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread2.jpg",
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread3.jpg",
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread4.jpg"
+    };
+
+    std::vector<cv::Point2f> face_landmark_5 = {
+            cv::Point2f(569.092041f, 398.845886f),
+            cv::Point2f(701.891724f, 399.156677f),
+            cv::Point2f(634.767212f, 482.927216f),
+            cv::Point2f(584.270996f, 543.294617f),
+            cv::Point2f(684.877991f, 543.067078f)
+    };
+//    cv::Mat img_bgr = cv::imread(test_img_path);
+//
+//    face_restoration_trt->detect_async(img_bgr,face_landmark_5,save_img_path);
+//
+//    std::cout<<"face restoration detect done!"<<std::endl;
+
+    // Start timing, then submit one async task per image
+    auto start_time = std::chrono::high_resolution_clock::now();
+
+    for (size_t i = 0; i < test_img_paths.size(); ++i) {
+        cv::Mat img_bgr = cv::imread(test_img_paths[i]);
+        face_restoration_trt->detect_async(img_bgr, face_landmark_5, save_img_paths[i]);
+        std::cout << "Submitted task " << i + 1 << " for processing" << std::endl;
+    }
+
+    // 6. Wait for all tasks to complete
+    std::cout << "Waiting for all tasks to complete..." << std::endl;
+    face_restoration_trt->wait_for_completion();
+
+    // 7. Compute and print the total elapsed time
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+
+    std::cout << "All tasks completed!" << std::endl;
+    std::cout << "Total processing time: " << duration.count() << "ms" << std::endl;
+    std::cout << "Average time per image: " << duration.count() / test_img_paths.size() << "ms" << std::endl;
+
+#endif
+}
+
+int main(__unused int argc, __unused char *argv[])
+{
+//    test_default();
+    test_tensorrt();
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/lite/cv/test_lite_face_swap.cpp b/examples/lite/cv/test_lite_face_swap.cpp
new file mode 100644
index 00000000..a7429030
--- /dev/null
+++ b/examples/lite/cv/test_lite_face_swap.cpp
@@ -0,0 +1,55 @@
+//
+// Created by wangzijian on 11/5/24.
+//
+#include "lite/lite.h"
+
+static void test_default()
+{
+#ifdef ENABLE_ONNXRUNTIME
+    std::string face_swap_onnx_path = "../../../examples/hub/onnx/cv/inswapper_128.onnx";
+    std::string face_detect_onnx_path = "../../../examples/hub/onnx/cv/yoloface_8n.onnx";
+    std::string face_landmarks_68 = "../../../examples/hub/onnx/cv/2dfan4.onnx";
+    std::string face_recognizer_onnx_path = "../../../examples/hub/onnx/cv/arcface_w600k_r50.onnx";
+
+    std::string source_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
+    std::string target_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_target.jpg";
+
+    lite::cv::face::detect::YOLOV8Face *yolov8_face = new lite::cv::face::detect::YOLOV8Face(face_detect_onnx_path);
+    lite::cv::faceid::Face_68Landmarks *face68Landmarks = new lite::cv::faceid::Face_68Landmarks(face_landmarks_68);
+    lite::cv::faceid::Face_Recognizer *face_recognizer = new lite::cv::faceid::Face_Recognizer(face_recognizer_onnx_path);
+    lite::cv::face::swap::InSwapper *face_swap = new lite::cv::face::swap::InSwapper(face_swap_onnx_path);
+
+    std::vector<lite::types::Boxf> detected_boxes;
+    cv::Mat img_bgr = cv::imread(source_image_path);
+    yolov8_face->detect(img_bgr, detected_boxes);
+    int position = 0; // the face at index 0
+    auto test_bounding_box = detected_boxes[0];
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect(img_bgr, test_bounding_box, face_landmark_5of68);
+    std::vector<float> source_image_embeding;
+    face_recognizer->detect(img_bgr,face_landmark_5of68,source_image_embeding);
+
+    // the source face is handled above; now process the target
+    std::vector<lite::types::Boxf> target_detected_boxes;
+    cv::Mat target_img_bgr = cv::imread(target_image_path);
+    yolov8_face->detect(target_img_bgr, target_detected_boxes);
+    auto target_test_bounding_box = target_detected_boxes[0];
+    std::vector<cv::Point2f> target_face_landmark_5of68;
+    face68Landmarks->detect(target_img_bgr, target_test_bounding_box,target_face_landmark_5of68);
+
+    cv::Mat face_swap_image;
+    face_swap->detect(target_img_bgr,source_image_embeding,target_face_landmark_5of68,face_swap_image);
+
+    delete yolov8_face;
+    delete face68Landmarks;
+    delete face_swap;
+    delete face_recognizer;
+#endif
+}
+
+int main(__unused int argc, __unused char *argv[])
+{
+    test_default();
+    return 0;
+}
\ No newline at end of file
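The test above manages the four detectors with raw `new`/`delete`. A hedged RAII variant of the same setup (no API changes assumed; this is a sketch, not part of the patch):

```cpp
// std::unique_ptr gives identical behavior without the manual deletes.
auto yolov8_face = std::make_unique<lite::cv::face::detect::YOLOV8Face>(face_detect_onnx_path);
auto face68Landmarks = std::make_unique<lite::cv::faceid::Face_68Landmarks>(face_landmarks_68);
auto face_recognizer = std::make_unique<lite::cv::faceid::Face_Recognizer>(face_recognizer_onnx_path);
auto face_swap = std::make_unique<lite::cv::face::swap::InSwapper>(face_swap_onnx_path);
// ... use exactly as before; destruction happens automatically at scope exit.
```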
diff --git a/examples/lite/cv/test_lite_facefusion_pipeline.cpp b/examples/lite/cv/test_lite_facefusion_pipeline.cpp
new file mode 100644
index 00000000..7134900d
--- /dev/null
+++ b/examples/lite/cv/test_lite_facefusion_pipeline.cpp
@@ -0,0 +1,45 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+#include "lite/lite.h"
+static void test_default()
+{
+#ifdef ENABLE_ONNXRUNTIME
+    std::string face_swap_onnx_path = "../../../examples/hub/onnx/cv/inswapper_128.onnx";
+    std::string face_detect_onnx_path = "../../../examples/hub/onnx/cv/yoloface_8n.onnx";
+    std::string face_landmarks_68 = "../../../examples/hub/onnx/cv/2dfan4.onnx";
+    std::string face_recognizer_onnx_path = "../../../examples/hub/onnx/cv/arcface_w600k_r50.onnx";
+    std::string face_restoration_onnx_path = "../../../examples/hub/onnx/cv/gfpgan_1.4.onnx";
+
+    auto pipeLine = lite::cv::face::swap::facefusion::PipeLine(
+            face_detect_onnx_path,
+            face_landmarks_68,
+            face_recognizer_onnx_path,
+            face_swap_onnx_path,
+            face_restoration_onnx_path
+    );
+
+    std::string source_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
+    std::string target_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_target.jpg";
+    std::string save_image_path = "../../../examples/logs/test_lite_facefusion_pipeline_result.jpg";
+
+    // Time the pipeline run
+    auto start = std::chrono::high_resolution_clock::now();
+
+    pipeLine.detect(source_image_path,target_image_path,save_image_path);
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+    std::cout << "Time: " << diff.count() << " s\n";
+
+#endif
+}
+
+int main()
+{
+    test_default();
+}
\ No newline at end of file
diff --git a/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg b/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg
new file mode 100644
index 00000000..d04c2239
Binary files /dev/null and b/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg differ
diff --git a/examples/lite/resources/test_lite_facefusion_pipeline_target.jpg b/examples/lite/resources/test_lite_facefusion_pipeline_target.jpg
new file mode 100644
index 00000000..a1948be1
Binary files /dev/null and b/examples/lite/resources/test_lite_facefusion_pipeline_target.jpg differ
diff --git a/lite/models.h b/lite/models.h
index 60eb82e6..4dd27101 100644
--- a/lite/models.h
+++ b/lite/models.h
@@ -115,6 +115,11 @@
 #include "lite/ort/cv/yolofacev8.h"
 #include "lite/ort/cv/light_enhance.h"
 #include "lite/ort/cv/real_esr_gan.h"
+#include "lite/ort/cv/face_68landmarks.h"
+#include "lite/ort/cv/face_recognizer.h"
+#include "lite/ort/cv/face_swap.h"
+#include "lite/ort/cv/face_restoration.h"
+#include "lite/ort/cv/face_fusion_pipeline.h"
 #include "lite/ort/sd/clip.h"
 #include "lite/ort/sd/unet.h"
 #include "lite/ort/sd/vae.h"
@@ -136,6 +141,11 @@
 #include "lite/trt/cv/trt_yolov5_blazeface.h"
 #include "lite/trt/cv/trt_lightenhance.h"
 #include "lite/trt/cv/trt_realesrgan.h"
+#include "lite/trt/cv/trt_face_68landmarks.h"
+#include "lite/trt/cv/trt_face_recognizer.h"
+#include "lite/trt/cv/trt_face_swap.h"
+#include "lite/trt/cv/trt_face_restoration.h"
+#include "lite/trt/cv/trt_facefusion_pipeline.h"
 #include "lite/trt/sd/trt_clip.h"
 #include "lite/trt/sd/trt_vae.h"
 #include "lite/trt/sd/trt_unet.h"
@@ -502,6 +512,11 @@ namespace lite
     typedef ortcv::YoloFaceV8 _ONNXYOLOFaceNet;
     typedef ortcv::LightEnhance _ONNXLightEnhance;
     typedef ortcv::RealESRGAN _ONNXRealESRGAN;
+    typedef ortcv::Face_68Landmarks _ONNXFace_68Landmarks;
+    typedef ortcv::Face_Recognizer _ONNXFace_Recognizer;
+    typedef ortcv::Face_Swap _ONNXFace_Swap;
+    typedef ortcv::Face_Restoration _ONNXFace_Restoration;
+    typedef ortcv::Face_Fusion_Pipeline _ONNXFace_Fusion_Pipeline;

    // 1. classification
    namespace classification
@@ -580,6 +595,25 @@
      }
+     namespace swap
+     {
+       namespace facefusion
+       {
+         typedef _ONNXYOLOFaceNet YOLOV8Face;
+         typedef _ONNXFace_Swap InSwapper;
+         typedef _ONNXFace_Restoration GFPGAN;
+         typedef _ONNXFace_68Landmarks Face_68Landmarks;
+         typedef _ONNXFace_Recognizer Face_Recognizer;
+         typedef _ONNXFace_Fusion_Pipeline PipeLine;
+       }
+       typedef _ONNXFace_Swap InSwapper;
+     }
+
+     namespace restoration
+     {
+       typedef _ONNXFace_Restoration GFPGAN;
+     }
+
     namespace pose
     {
       typedef _ONNXFSANet FSANet; // head pose estimation.
@@ -618,7 +652,8 @@
       typedef _ONNXCavaGhostArcFace CavaGhostArcFace;
       typedef _ONNXCavaCombinedFace CavaCombinedFace;
       typedef _ONNXMobileSEFocalFace MobileSEFocalFace;
-
+      typedef _ONNXFace_68Landmarks Face_68Landmarks;
+      typedef _ONNXFace_Recognizer Face_Recognizer;
     }
     // 5. segmentation
     namespace segmentation
@@ -733,6 +768,11 @@ namespace lite{
    typedef trtcv::TRTLightEnhance _TRT_LightEnhance;
    typedef trtcv::TRTRealESRGAN _TRT_RealESRGAN;
    typedef trtcv::TRTMODNet _TRT_MODNet;
+   typedef trtcv::TRTFaceFusionFace68Landmarks _TRT_FaceFusionFace68Landmarks;
+   typedef trtcv::TRTFaceFusionFaceRecognizer _TRTFaceFusionFaceRecognizer;
+   typedef trtcv::TRTFaceFusionFaceSwap _TRTFaceFusionFaceSwap;
+   typedef trtcv::TRTFaceFusionFaceRestoration _TRTFaceFusionFaceRestoration;
+   typedef trtcv::TRTFaceFusionPipeLine _TRTFaceFusionPipeLine;
    namespace classification
    {
@@ -755,6 +795,20 @@
      typedef _TRT_YOLOFaceNet YOLOV8Face;
      typedef _TRT_YOLO5Face YOLOV5Face;
    }
+    namespace swap
+    {
+      typedef _TRTFaceFusionFaceSwap FaceFusionFaceSwap;
+      typedef _TRTFaceFusionPipeLine FaceFusionPipeLine;
+    }
+    namespace restoration
+    {
+      typedef _TRTFaceFusionFaceRestoration TRTGFPGAN;
+    }
+  }
+  namespace faceid
+  {
+    typedef _TRT_FaceFusionFace68Landmarks FaceFusionFace68Landmarks;
+    typedef _TRTFaceFusionFaceRecognizer FaceFusionFaceRecognizer;
   }
   namespace lightenhance
   {
diff --git a/lite/ort/cv/face_68landmarks.cpp b/lite/ort/cv/face_68landmarks.cpp
new file mode 100644
index 00000000..7bbff59c
--- /dev/null
+++ b/lite/ort/cv/face_68landmarks.cpp
@@ -0,0 +1,97 @@
+//
+// Created by wangzijian on 11/1/24.
+//
+
+#include "face_68landmarks.h"
+
+using ortcv::Face_68Landmarks;
+
+void Face_68Landmarks::preprocess(const lite::types::Boxf &bounding_box,
+                                  const cv::Mat &input_mat,
+                                  cv::Mat &crop_img) {
+
+    float xmin = bounding_box.x1;
+    float ymin = bounding_box.y1;
+    float xmax = bounding_box.x2;
+    float ymax = bounding_box.y2;
+
+    float width = xmax - xmin;
+    float height = ymax - ymin;
+    float max_side = std::max(width, height);
+    float scale = 195.0f / max_side;
+
+    float center_x = (xmax + xmin) * scale;
+    float center_y = (ymax + ymin) * scale;
+
+    cv::Point2f translation;
+    translation.x = (256.0f - center_x) * 0.5f;
+    translation.y = (256.0f - center_y) * 0.5f;
+
+    cv::Size crop_size(256, 256);
+
+    std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
+
+    crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
+}
+
+Ort::Value Face_68Landmarks::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_68Landmarks::detect(const cv::Mat &input_mat, const lite::types::BoundingBoxType<float, float> &bbox,
+                              std::vector<cv::Point2f> &face_landmark_5of68) {
+    if (input_mat.empty()) return;
+
+    img_with_landmarks = input_mat.clone();
+    cv::Mat crop_image;
+
+    preprocess(bbox,input_mat,crop_image);
+
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    postprocess(output_tensors,face_landmark_5of68);
+}
+
+void Face_68Landmarks::postprocess(std::vector<Ort::Value> &ort_outputs,
+                                   std::vector<cv::Point2f> &face_landmark_5of68) {
+    float *pdata = ort_outputs[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape();
+    std::vector<cv::Point2f> landmarks;
+
+    for (int i = 0;i < 68; ++i)
+    {
+        // predictions live on a 64x64 grid; map them back to the 256x256 crop
+        float x = pdata[i * 3] / 64.0f * 256.f;
+        float y = pdata[i * 3 + 1] / 64.0f * 256.f;
+        landmarks.emplace_back(x, y);
+    }
+
+    cv::Mat inverse_affine_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+
+    cv::transform(landmarks, landmarks, inverse_affine_matrix);
+
+    face_landmark_5of68 = face_utils::convert_face_landmark_68_to_5(landmarks);
+}
diff --git a/lite/ort/cv/face_68landmarks.h b/lite/ort/cv/face_68landmarks.h
new file mode 100644
index 00000000..3bb89c79
--- /dev/null
+++ b/lite/ort/cv/face_68landmarks.h
@@ -0,0 +1,45 @@
+//
+// Created by wangzijian on 11/1/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_68LANDMARKS_H
+#define LITE_AI_TOOLKIT_FACE_68LANDMARKS_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/core/ort_types.h"
+#include "lite/ort/core/ort_utils.h"
+#include "lite/utils.h"
+#include "lite/ort/cv/face_utils.h"
+#include "algorithm"
+
+namespace ortcv{
+    class LITE_EXPORTS Face_68Landmarks : public BasicOrtHandler{
+    public:
+        explicit Face_68Landmarks(const std::string &_onnx_path, unsigned int _num_threads = 1):
+                BasicOrtHandler(_onnx_path, _num_threads){};
+
+        ~Face_68Landmarks() override = default;
+
+    private:
+        cv::Mat affine_matrix;
+        cv::Mat img_with_landmarks;
+
+    private:
+        void preprocess(const lite::types::Boxf &bounding_box,const cv::Mat &input_mat,cv::Mat &crop_img);
+
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+        void postprocess(std::vector<Ort::Value> &ort_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
+
+    public:
+        void detect(const cv::Mat &input_mat,const lite::types::BoundingBoxType<float, float> &bbox, std::vector<cv::Point2f> &face_landmark_5of68);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_68LANDMARKS_H
diff --git a/lite/ort/cv/face_fusion_pipeline.cpp b/lite/ort/cv/face_fusion_pipeline.cpp
new file mode 100644
index 00000000..f3192719
--- /dev/null
+++ b/lite/ort/cv/face_fusion_pipeline.cpp
@@ -0,0 +1,45 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#include "face_fusion_pipeline.h"
+using ortcv::Face_Fusion_Pipeline;
+
+Face_Fusion_Pipeline::Face_Fusion_Pipeline(const std::string &face_detect_onnx_path,
+                                           const std::string &face_landmarks_68_onnx_path,
+                                           const std::string &face_recognizer_onnx_path,
+                                           const std::string &face_swap_onnx_path,
+                                           const std::string &face_restoration_onnx_path) {
+    face_detect = std::make_unique<YoloFaceV8>(face_detect_onnx_path,6);
+    face_landmarks = std::make_unique<Face_68Landmarks>(face_landmarks_68_onnx_path,6);
+    face_recognizer = std::make_unique<Face_Recognizer>(face_recognizer_onnx_path,6);
+    face_swap = std::make_unique<Face_Swap>(face_swap_onnx_path,6);
+    face_restoration = std::make_unique<Face_Restoration>(face_restoration_onnx_path,6);
+}
+
+void Face_Fusion_Pipeline::detect(const std::string &source_image, const std::string &target_image,const std::string &save_image_path) {
+    std::vector<lite::types::Boxf> detected_boxes;
+    cv::Mat img_bgr = cv::imread(source_image);
+    face_detect->detect(img_bgr,detected_boxes);
+
+    int position = 0; // position number 0
+    auto test_bounding_box = detected_boxes[0];
+    std::vector<cv::Point2f> face_landmark_5of68;
+
+    face_landmarks->detect(img_bgr, test_bounding_box, face_landmark_5of68);
+    std::vector<float> source_image_embeding;
+    face_recognizer->detect(img_bgr,face_landmark_5of68,source_image_embeding);
+
+    std::vector<lite::types::Boxf> target_detected_boxes;
+    cv::Mat target_img_bgr = cv::imread(target_image);
+    face_detect->detect(target_img_bgr, target_detected_boxes);
+    auto target_test_bounding_box = target_detected_boxes[0];
+    std::vector<cv::Point2f> target_face_landmark_5of68;
+    face_landmarks->detect(target_img_bgr, target_test_bounding_box,target_face_landmark_5of68);
+
+    cv::Mat face_swap_image;
+    face_swap->detect(target_img_bgr,source_image_embeding,target_face_landmark_5of68,face_swap_image);
+    face_restoration->detect(face_swap_image,target_face_landmark_5of68,save_image_path);
+}
\ No newline at end of file
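The pipeline indexes `detected_boxes[0]` unconditionally. A hedged robustness sketch (illustrative only, not part of the patch) that would avoid an out-of-range access on images with no detected face:

```cpp
// Bail out early when detection finds nothing.
if (detected_boxes.empty()) {
    std::cerr << "FaceFusion pipeline: no face detected in the source image." << std::endl;
    return;
}
auto test_bounding_box = detected_boxes[0]; // safe now
```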
diff --git a/lite/ort/cv/face_fusion_pipeline.h b/lite/ort/cv/face_fusion_pipeline.h
new file mode 100644
index 00000000..bbf31395
--- /dev/null
+++ b/lite/ort/cv/face_fusion_pipeline.h
@@ -0,0 +1,38 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_FUSION_PIPELINE_H
+#define LITE_AI_TOOLKIT_FACE_FUSION_PIPELINE_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/cv/face_restoration.h"
+#include "lite/ort/cv/face_swap.h"
+#include "lite/ort/cv/face_recognizer.h"
+#include "lite/ort/cv/yolofacev8.h"
+#include "lite/ort/cv/face_68landmarks.h"
+
+namespace ortcv{
+    class Face_Fusion_Pipeline{
+    public:
+        Face_Fusion_Pipeline(
+                const std::string &face_detect_onnx_path,
+                const std::string &face_landmarks_68_onnx_path,
+                const std::string &face_recognizer_onnx_path,
+                const std::string &face_swap_onnx_path,
+                const std::string &face_restoration_onnx_path
+        );
+        ~Face_Fusion_Pipeline() = default; // members are managed by smart pointers
+
+    private:
+        std::unique_ptr<Face_Restoration> face_restoration;
+        std::unique_ptr<YoloFaceV8> face_detect;
+        std::unique_ptr<Face_68Landmarks> face_landmarks;
+        std::unique_ptr<Face_Recognizer> face_recognizer;
+        std::unique_ptr<Face_Swap> face_swap;
+
+    public:
+        void detect(const std::string &source_image,const std::string &target_image,const std::string &save_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_FUSION_PIPELINE_H
diff --git a/lite/ort/cv/face_recognizer.cpp b/lite/ort/cv/face_recognizer.cpp
new file mode 100644
index 00000000..8082d800
--- /dev/null
+++ b/lite/ort/cv/face_recognizer.cpp
@@ -0,0 +1,96 @@
+//
+// Created by wangzijian on 11/4/24.
+//
+
+#include "face_recognizer.h"
+using ortcv::Face_Recognizer;
+
+cv::Mat Face_Recognizer::preprocess(cv::Mat &input_mat, std::vector<cv::Point2f> &face_landmark_5,cv::Mat &preprocessed_mat) {
+    cv::Mat crop_image;
+    cv::Mat affine_martix;
+
+    std::tie(crop_image,affine_martix) = face_utils::warp_face_by_face_landmark_5(input_mat,face_landmark_5,face_utils::ARCFACE_112_V2);
+    crop_image.convertTo(crop_image,CV_32FC3, 1.0f / 127.5f,-1.0);
+    cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB);
+
+    return crop_image;
+}
+
+Ort::Value Face_Recognizer::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_Recognizer::detect(cv::Mat &input_mat, std::vector<cv::Point2f> &face_landmark_5) {
+    cv::Mat ori_image = input_mat.clone();
+
+    cv::Mat crop_image = preprocess(input_mat,face_landmark_5,ori_image);
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    float *pdata = output_tensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    std::vector<float> output(pdata, pdata + 512);
+
+    // L2-normalize the 512-d embedding
+    float norm = 0.0f;
+    for (const auto &val : output) {
+        norm += val * val;
+    }
+    norm = std::sqrt(norm);
+
+    for (auto &val : output) {
+        val /= norm;
+    }
+
+    std::cout << "done!" << std::endl;
+}
+
+void Face_Recognizer::detect(cv::Mat &input_mat, std::vector<cv::Point2f> &face_landmark_5, std::vector<float> &embeding) {
+    cv::Mat ori_image = input_mat.clone();
+
+    cv::Mat crop_image = preprocess(input_mat,face_landmark_5,ori_image);
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    float *pdata = output_tensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    embeding.assign(pdata,pdata + 512);
+    std::vector<float> normal_embeding(pdata,pdata + 512);
+
+    // L2-normalize a copy of the embedding
+    float norm = 0.0f;
+    for (const auto &val : normal_embeding) {
+        norm += val * val;
+    }
+    norm = std::sqrt(norm);
+
+    for (auto &val : normal_embeding) {
+        val /= norm;
+    }
+
+    std::cout << "done!" << std::endl;
+}
diff --git a/lite/ort/cv/face_recognizer.h b/lite/ort/cv/face_recognizer.h
new file mode 100644
--- /dev/null
+++ b/lite/ort/cv/face_recognizer.h
+#ifndef LITE_AI_TOOLKIT_FACE_RECOGNIZER_H
+#define LITE_AI_TOOLKIT_FACE_RECOGNIZER_H
+
+namespace ortcv{
+    class LITE_EXPORTS Face_Recognizer : public BasicOrtHandler{
+    private:
+        cv::Mat preprocess(cv::Mat &input_mat,std::vector<cv::Point2f> &face_landmark_5,cv::Mat &preprocessed_mat);
+
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+    public:
+        void detect(cv::Mat &input_mat,std::vector<cv::Point2f> &face_landmark_5);
+
+        void detect(cv::Mat &input_mat,std::vector<cv::Point2f> &face_landmark_5,std::vector<float> &embeding);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_RECOGNIZER_H
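Why the embedding is L2-normalized: identity comparison then reduces to a plain dot product (cosine similarity). A hedged sketch of that comparison step (the helper name is hypothetical; it is not part of this patch):

```cpp
#include <numeric>
#include <vector>

// Cosine similarity of two already-L2-normalized 512-d embeddings.
static float cosine_similarity(const std::vector<float> &a, const std::vector<float> &b) {
    return std::inner_product(a.begin(), a.end(), b.begin(), 0.0f);
}
// A value near 1.0 suggests the same identity; near 0.0, a different one.
```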
diff --git a/lite/ort/cv/face_restoration.cpp b/lite/ort/cv/face_restoration.cpp
new file mode 100644
index 00000000..77df1bf2
--- /dev/null
+++ b/lite/ort/cv/face_restoration.cpp
@@ -0,0 +1,95 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#include "face_restoration.h"
+
+using ortcv::Face_Restoration;
+
+Ort::Value Face_Restoration::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_Restoration::detect(cv::Mat &face_swap_image, std::vector<cv::Point2f> &target_landmarks_5 , const std::string &face_enchaner_path) {
+    auto ori_image = face_swap_image.clone();
+
+    cv::Mat crop_image;
+    cv::Mat affine_matrix;
+    std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,face_utils::FFHQ_512);
+
+    std::vector<float> crop_size = {512,512};
+    cv::Mat box_mask = face_utils::create_static_box_mask(crop_size);
+    std::vector<cv::Mat> crop_mask_list;
+    crop_mask_list.emplace_back(box_mask);
+
+    cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB);
+    crop_image.convertTo(crop_image,CV_32FC3,1.f / 255.f);
+    crop_image.convertTo(crop_image,CV_32FC3,2.0f,-1.f); // scale to [-1, 1]
+
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    float *pdata = output_tensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    int channel = 3;
+    int height = 512;
+    int width = 512;
+    std::vector<float> output(channel * height * width);
+    output.assign(pdata,pdata + (channel * height * width));
+
+    // clamp to [-1, 1], then map to [0, 1]
+    std::transform(output.begin(),output.end(),output.begin(),
+                   [](float x){return std::max(-1.0f,std::min(1.0f,x));});
+
+    std::transform(output.begin(),output.end(),output.begin(),
+                   [](float x){return (x + 1.f) / 2.f;});
+
+    // CHW -> HWC
+    std::vector<float> transposed_data(channel * height * width);
+    for (int c = 0; c < channel; ++c){
+        for (int h = 0 ; h < height; ++h){
+            for (int w = 0; w < width ; ++w){
+                int src_index = c * (height * width) + h * width + w;
+                int dst_index = h * (width * channel) + w * channel + c;
+                transposed_data[dst_index] = output[src_index];
+            }
+        }
+    }
+
+    std::transform(transposed_data.begin(),transposed_data.end(),transposed_data.begin(),
+                   [](float x){return std::round(x * 255.f);});
+
+    std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(),
+                   [](float x) { return static_cast<int>(x); });
+
+    cv::Mat mat(height, width, CV_32FC3, transposed_data.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    auto crop_mask = crop_mask_list[0];
+    cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);
+
+    cv::Mat dst_image = face_utils::blend_frame(ori_image,paste_frame);
+
+    cv::imwrite(face_enchaner_path,dst_image);
+}
\ No newline at end of file
diff --git a/lite/ort/cv/face_restoration.h b/lite/ort/cv/face_restoration.h
new file mode 100644
index 00000000..e4febb85
--- /dev/null
+++ b/lite/ort/cv/face_restoration.h
@@ -0,0 +1,28 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_RESTORATION_H
+#define LITE_AI_TOOLKIT_FACE_RESTORATION_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/core/ort_types.h"
+#include "lite/ort/core/ort_utils.h"
+#include "lite/ort/cv/face_utils.h"
+
+namespace ortcv{
+    class LITE_EXPORTS Face_Restoration : public BasicOrtHandler{
+    public:
+        explicit Face_Restoration(const std::string &_onnx_path, unsigned int _num_threads = 1):
+                BasicOrtHandler(_onnx_path,_num_threads){};
+        ~Face_Restoration() override = default;
+
+    private:
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+    public:
+        void detect(cv::Mat &face_swap_image,std::vector<cv::Point2f> &target_landmarks_5 ,const std::string &face_enchaner_path);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_RESTORATION_H
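The hand-rolled CHW→HWC transpose in `Face_Restoration::detect` above can also be expressed with OpenCV planes. A hedged, behavior-equivalent sketch (the helper name is hypothetical):

```cpp
#include <opencv2/opencv.hpp>
#include <vector>

// Wrap each CHW plane of the network output (pdata = C*H*W floats) without copying,
// then let cv::merge interleave them into an HWC CV_32FC3 image.
cv::Mat chw_to_hwc(float *pdata, int height, int width) {
    std::vector<cv::Mat> planes = {
            cv::Mat(height, width, CV_32F, pdata + 0 * height * width),
            cv::Mat(height, width, CV_32F, pdata + 1 * height * width),
            cv::Mat(height, width, CV_32F, pdata + 2 * height * width)};
    cv::Mat hwc;
    cv::merge(planes, hwc); // CV_32FC3; channel order unchanged (RGB here)
    return hwc.clone();     // clone because the planes alias pdata
}
```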
diff --git a/lite/ort/cv/face_swap.cpp b/lite/ort/cv/face_swap.cpp
new file mode 100644
index 00000000..36ec87ac
--- /dev/null
+++ b/lite/ort/cv/face_swap.cpp
@@ -0,0 +1,120 @@
+//
+// Created by wangzijian on 11/5/24.
+//
+
+#include "face_swap.h"
+using ortcv::Face_Swap;
+
+void Face_Swap::preprocess(cv::Mat &target_face, std::vector<float> source_image_embeding,
+                           std::vector<cv::Point2f> target_landmark_5,std::vector<float> &processed_source_embeding,
+                           cv::Mat &preprocessed_mat) {
+
+    std::tie(preprocessed_mat, affine_martix) = face_utils::warp_face_by_face_landmark_5(target_face,target_landmark_5,face_utils::ARCFACE_128_V2);
+
+    std::vector<float> crop_size= {128.0,128.0};
+    crop_list.emplace_back(face_utils::create_static_box_mask(crop_size));
+
+    cv::cvtColor(preprocessed_mat,preprocessed_mat,cv::COLOR_BGR2RGB);
+    preprocessed_mat.convertTo(preprocessed_mat,CV_32FC3,1.0 / 255.f);
+    preprocessed_mat.convertTo(preprocessed_mat,CV_32FC3,1.0 / 1.f,0);
+
+    std::vector<float> model_martix = face_utils::load_npy("/home/facefusion-onnxrun/python/model_matrix.npy");
+
+    processed_source_embeding = face_utils::dot_product(source_image_embeding,model_martix,512);
+
+    face_utils::normalize(processed_source_embeding);
+
+    std::cout << "done!" << std::endl;
+}
+
+Ort::Value Face_Swap::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_Swap::detect(cv::Mat &target_image, std::vector<float> source_face_embeding,std::vector<cv::Point2f> target_landmark_5,
+                       cv::Mat &face_swap_image){
+
+    cv::Mat ori_image = target_image.clone();
+    std::vector<float> source_embeding_input;
+    cv::Mat model_input_mat;
+    preprocess(target_image,source_face_embeding,target_landmark_5,source_embeding_input,model_input_mat);
+    Ort::Value inputTensor_target = transform(model_input_mat);
+
+    std::vector<int64_t> input_node_dims = {1, 512};
+    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+    Ort::Value inputTensor_embeding = Ort::Value::CreateTensor<float>(
+            memory_info,
+            source_embeding_input.data(),
+            source_embeding_input.size(),
+            input_node_dims.data(),
+            input_node_dims.size()
+    );
+
+    std::vector<Ort::Value> inputTensors;
+    inputTensors.push_back(std::move(inputTensor_target));
+    inputTensors.push_back(std::move(inputTensor_embeding));
+
+    Ort::RunOptions runOptions;
+
+    std::vector<const char*> input_node_names_face_swap = {
+            "target",
+            "source",
+    };
+
+    std::vector<const char*> output_node_names_face_swap = {
+            "output"
+    };
+
+    std::vector<Ort::Value> outputTensors = ort_session->Run(
+            runOptions,
+            input_node_names_face_swap.data(),
+            inputTensors.data(),
+            inputTensors.size(),
+            output_node_names_face_swap.data(),
+            output_node_names_face_swap.size()
+    );
+
+    float *p_data = outputTensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    std::vector<float> output_swap_image(1 * 3 * 128 * 128);
+    output_swap_image.assign(p_data,p_data + (1 * 3 * 128 * 128));
+
+    std::vector<float> transposed(3 * 128 * 128);
+    int channels = 3;
+    int height = 128;
+    int width = 128;
+
+    for (int c = 0; c < channels; ++c) {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+                int src_idx = c * (height * width) + h * width + w;      // CHW
+                int dst_idx = h * (width * channels) + w * channels + c; // HWC
+                transposed[dst_idx] = output_swap_image[src_idx];
+            }
+        }
+    }
+
+    for (auto& val : transposed) {
+        val = std::round(val * 255.0);
+    }
+
+    cv::Mat mat(height, width, CV_32FC3, transposed.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    cv::Mat dst_image = face_utils::paste_back(ori_image,mat,crop_list[0],affine_martix);
+    face_swap_image = dst_image;
+}
\ No newline at end of file
diff --git a/lite/ort/cv/face_swap.h b/lite/ort/cv/face_swap.h
new file mode 100644
index 00000000..9346aef2
--- /dev/null
+++ b/lite/ort/cv/face_swap.h
@@ -0,0 +1,38 @@
+//
+// Created by wangzijian on 11/5/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_SWAP_H
+#define LITE_AI_TOOLKIT_FACE_SWAP_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/core/ort_types.h"
+#include "lite/ort/core/ort_utils.h"
+#include "lite/ort/cv/face_restoration.h"
+#include "lite/ort/cv/face_utils.h"
+
+namespace ortcv{
+    class LITE_EXPORTS Face_Swap : public BasicOrtHandler
+    {
+    public:
+        explicit Face_Swap(const std::string &_onnx_path, unsigned int _num_threads = 1):
+                BasicOrtHandler(_onnx_path, _num_threads){};
+        ~Face_Swap() override = default;
+    private:
+        void preprocess(cv::Mat &target_face,std::vector<float> source_image_embeding,std::vector<cv::Point2f> target_landmark_5,
+                        std::vector<float> &processed_source_embeding,cv::Mat &preprocessed_mat);
+
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+    private:
+        std::vector<cv::Mat> crop_list;
+
+        cv::Mat affine_martix;
+
+    public:
+        void detect(cv::Mat &target_image,std::vector<float> source_face_embeding,std::vector<cv::Point2f> target_landmark_5, cv::Mat &face_swap_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_SWAP_H
diff --git a/lite/ort/cv/face_utils.cpp b/lite/ort/cv/face_utils.cpp
new file mode 100644
index 00000000..56df2ee6
--- /dev/null
+++ b/lite/ort/cv/face_utils.cpp
@@ -0,0 +1,335 @@
+//
+// Created by wangzijian on 11/11/24.
+//
+
+#include "face_utils.h"
+
+cv::Mat
+face_utils::paste_back(const cv::Mat &temp_vision_frame, const cv::Mat &crop_vision_frame, const cv::Mat &crop_mask,
+                       const cv::Mat &affine_matrix) {
+
+    // Make sure every image is float
+    cv::Mat temp_float, crop_float, mask_float;
+    temp_vision_frame.convertTo(temp_float, CV_32F);
+    crop_vision_frame.convertTo(crop_float, CV_32F);
+    crop_mask.convertTo(mask_float, CV_32F);
+
+    // Invert the affine transform
+    cv::Mat inverse_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_matrix);
+
+    // Target size
+    cv::Size temp_size(temp_vision_frame.cols, temp_vision_frame.rows);
+
+    // Warp the mask back with the inverse affine transform
+    cv::Mat inverse_mask;
+    cv::warpAffine(mask_float, inverse_mask, inverse_matrix, temp_size);
+    cv::threshold(inverse_mask, inverse_mask, 1.0, 1.0, cv::THRESH_TRUNC);  // clip at 1
+    cv::threshold(inverse_mask, inverse_mask, 0.0, 0.0, cv::THRESH_TOZERO); // clip at 0
+
+    // Warp the cropped face back with the inverse affine transform
+    cv::Mat inverse_vision_frame;
+    cv::warpAffine(crop_float, inverse_vision_frame, inverse_matrix,
+                   temp_size, cv::INTER_LINEAR, cv::BORDER_REPLICATE);
+
+    // Create the output image
+    cv::Mat paste_vision_frame;
+    temp_float.copyTo(paste_vision_frame);
+
+    // Blend channel by channel
+    std::vector<cv::Mat> channels(3);
+    std::vector<cv::Mat> inverse_channels(3);
+    std::vector<cv::Mat> temp_channels(3);
+
+    cv::split(inverse_vision_frame, inverse_channels);
+    cv::split(temp_float, temp_channels);
+
+    // Build 1 - mask
+    cv::Mat inverse_weight;
+    cv::subtract(cv::Scalar(1.0), inverse_mask, inverse_weight);
+
+    for (int i = 0; i < 3; ++i) {
+        // Keep all arithmetic in the same type (CV_32F)
+        cv::Mat weighted_inverse, weighted_temp;
+        cv::multiply(inverse_mask, inverse_channels[i], weighted_inverse);
+        cv::multiply(inverse_weight, temp_channels[i], weighted_temp);
+        cv::add(weighted_inverse, weighted_temp, channels[i]);
+    }
+
+    cv::merge(channels, paste_vision_frame);
+
+    // Convert back to the original type if necessary
+    cv::Mat result;
+    if(temp_vision_frame.type() != CV_32F) {
+        paste_vision_frame.convertTo(result, temp_vision_frame.type());
+    } else {
+        result = paste_vision_frame;
+    }
+
+    return result;
+}
+
+namespace face_utils
+{
+    const std::vector<cv::Point2f> face_template_128 = {
+            cv::Point2f(0.36167656, 0.40387734),
+            cv::Point2f(0.63696719, 0.40235469),
+            cv::Point2f(0.50019687, 0.56044219),
+            cv::Point2f(0.38710391, 0.72160547),
+            cv::Point2f(0.61507734, 0.72034453)
+    };
+
+    const std::vector<cv::Point2f> face_template_112 = {
+            cv::Point2f(0.34191607, 0.46157411),
+            cv::Point2f(0.65653393, 0.45983393),
+            cv::Point2f(0.50022500, 0.64050536),
+            cv::Point2f(0.37097589, 0.82469196),
+            cv::Point2f(0.63151696, 0.82325089)
+    };
+
+    const std::vector<cv::Point2f> face_template_512 = {
+            cv::Point2f(0.37691676, 0.46864664),
+            cv::Point2f(0.62285697, 0.46912813),
+            cv::Point2f(0.50123859, 0.61331904),
+            cv::Point2f(0.39308822, 0.72541100),
+            cv::Point2f(0.61150205, 0.72490465)
+    };
+
+    const std::vector<std::vector<cv::Point2f>> face_template_vector = {face_template_112, face_template_128, face_template_512};
+}
+
+std::pair<cv::Mat, cv::Mat>
+face_utils::warp_face_by_face_landmark_5(cv::Mat input_mat, std::vector<cv::Point2f> face_landmark_5,
+                                         unsigned int type) {
+
+    std::vector<cv::Point2f> current_template_select;
+    if (type == face_utils::ARCFACE_112_V2)
+    {
+        current_template_select = face_utils::face_template_vector[0];
+    }
+
+    if (type == face_utils::ARCFACE_128_V2)
+    {
+        current_template_select = face_utils::face_template_vector[1];
+    }
+
+    if (type == face_utils::FFHQ_512)
+    {
+        current_template_select = face_utils::face_template_vector[2];
+    }
+
+    // Build the normalized template points
+    std::vector<cv::Point2f> normed_template;
+    for(auto current_template : current_template_select) // face_template could also live as a class member
+    {
+        current_template.x = current_template.x * type; // the FaceType value doubles as the crop side length (e.g. 512)
+        current_template.y = current_template.y * type; // note: the original code scaled y by x; fixed to use y here
+        normed_template.emplace_back(current_template);
+    }
+
+    // Estimate the affine transform
+    cv::Mat inliers;
+    cv::Mat affine_matrix = cv::estimateAffinePartial2D(
+            face_landmark_5,
+            normed_template,
+            inliers,
+            cv::RANSAC,
+            100
+    );
+
+    // Check that the transform is valid
+    if (affine_matrix.empty()) {
+        throw std::runtime_error("Failed to estimate affine transformation");
+    }
+
+    // Apply the affine warp
+    cv::Mat crop_img;
+    cv::warpAffine(
+            input_mat,
+            crop_img,
+            affine_matrix,
+            cv::Size(type, type),
+            cv::INTER_AREA,
+            cv::BORDER_REPLICATE
+    );
+
+    return std::make_pair(crop_img, affine_matrix);
+}
+
+std::vector<float>
+face_utils::dot_product(const std::vector<float> &vec, const std::vector<float> &matrix, int matrix_cols) {
+    std::vector<float> result(matrix_cols);
+    int vec_size = vec.size();
+
+    for (int j = 0; j < matrix_cols; ++j) {
+        float sum = 0.0f;
+        for (int i = 0; i < vec_size; ++i) {
+            sum += vec[i] * matrix[i * matrix_cols + j];
+        }
+        result[j] = sum;
+    }
+    return result;
+}
+
+float face_utils::calculate_norm(const std::vector<float> &vec) {
+    float sum = 0.0f;
+    for (float v : vec) {
+        sum += v * v;
+    }
+    return std::sqrt(sum);
+}
+
+void face_utils::normalize(std::vector<float> &vec) {
+    float norm = calculate_norm(vec);
+    if (norm > 0) {
+        for (float& v : vec) {
+            v /= norm;
+        }
+    }
+}
+
+std::vector<float> face_utils::load_npy(const std::string &filename) {
+    std::ifstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        throw std::runtime_error("Cannot open file: " + filename);
+    }
+
+    char magic[6];
+    file.read(magic, 6);
+    if (magic[0] != '\x93' || magic[1] != 'N' || magic[2] != 'U' ||
+        magic[3] != 'M' || magic[4] != 'P' || magic[5] != 'Y') {
+        throw std::runtime_error("Invalid .npy file format");
+    }
+
+    uint8_t major_version, minor_version;
+    file.read(reinterpret_cast<char*>(&major_version), 1);
+    file.read(reinterpret_cast<char*>(&minor_version), 1);
+
+    uint16_t header_len;
+    file.read(reinterpret_cast<char*>(&header_len), 2);
+
+    std::vector<char> header(header_len);
+    file.read(header.data(), header_len);
+
+    // the header is skipped: the shape is assumed to be 512x512 float32
+    size_t num_elements = 512 * 512;
+
+    // Read the payload
+    std::vector<float> data(num_elements);
+    file.read(reinterpret_cast<char*>(data.data()), num_elements * sizeof(float));
+
+    return data;
+}
+
+std::pair<cv::Mat, cv::Mat>
+face_utils::warp_face_by_translation(const cv::Mat &temp_img, cv::Point2f &translation, float scale,
+                                     const cv::Size &crop_size) {
+    cv::Mat affine_matrix = (cv::Mat_<float>(2, 3) << scale, 0, translation.x,
+            0, scale, translation.y);
+
+    cv::Mat crop_img;
+    cv::warpAffine(temp_img, crop_img, affine_matrix, crop_size);
+
+    return {crop_img, affine_matrix};
+}
+
+std::vector<cv::Point2f> face_utils::convert_face_landmark_68_to_5(const std::vector<cv::Point2f> &landmark_68) {
+    std::vector<cv::Point2f> face_landmark_5;
+
+    // Center of the left eye
+    cv::Point2f left_eye(0.0f, 0.0f);
+    for (int i = 36; i < 42; ++i) {
+        left_eye += landmark_68[i];
+    }
+    left_eye *= (1.0f / 6.0f); // average
+
+    // Center of the right eye
+    cv::Point2f right_eye(0.0f, 0.0f);
+    for (int i = 42; i < 48; ++i) {
+        right_eye += landmark_68[i];
+    }
+    right_eye *= (1.0f / 6.0f); // average
+
+    // Nose tip
+    cv::Point2f nose = landmark_68[30];
+
+    // Left and right mouth corners
+    cv::Point2f left_mouth_end = landmark_68[48];
+    cv::Point2f right_mouth_end = landmark_68[54];
+
+    // Collect the 5 points
+    face_landmark_5.push_back(left_eye);
+    face_landmark_5.push_back(right_eye);
+    face_landmark_5.push_back(nose);
+    face_landmark_5.push_back(left_mouth_end);
+    face_landmark_5.push_back(right_mouth_end);
+
+    return face_landmark_5;
+}
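A hedged usage sketch of the alignment helper above (the frame/landmark variables are placeholders), showing that the FaceType value both selects the template and sets the crop size:

```cpp
#include "lite/ort/cv/face_utils.h"

// Align a face crop for ArcFace-112; M maps frame -> crop coordinates.
cv::Mat align_for_arcface(const cv::Mat &frame, const std::vector<cv::Point2f> &landmarks5) {
    cv::Mat aligned, M;
    std::tie(aligned, M) = face_utils::warp_face_by_face_landmark_5(frame, landmarks5, face_utils::ARCFACE_112_V2);
    return aligned; // 112x112; use cv::invertAffineTransform(M, ...) to map results back
}
```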
+cv::Mat face_utils::blend_frame(const cv::Mat &target_image, const cv::Mat &paste_frame) {
+    // blend 80% toward the restored frame
+    float face_enhancer_blend = 1.0f - (80.0f / 100.0f);
+
+    cv::Mat temp_vision_frame;
+
+    cv::addWeighted(target_image, face_enhancer_blend,
+                    paste_frame, 1.0f - face_enhancer_blend,
+                    0,
+                    temp_vision_frame);
+
+    return temp_vision_frame;
+}
+
+cv::Mat face_utils::create_static_box_mask(std::vector<float> crop_size) {
+
+    float face_mask_blur = 0.3;
+
+    std::vector<float> face_mask_padding = {0,0,0,0};
+
+    // Calculate blur parameters
+    int blur_amount = static_cast<int>(crop_size[0] * 0.5 * face_mask_blur);
+    int blur_area = std::max(blur_amount / 2, 1);
+
+    // Create initial mask filled with ones
+    cv::Mat box_mask = cv::Mat::ones(crop_size[1], crop_size[0], CV_32F);
+
+    // Calculate padding areas
+    int top_padding = std::max(blur_area, static_cast<int>(crop_size[1] * face_mask_padding[0] / 100.0));
+    int bottom_padding = std::max(blur_area, static_cast<int>(crop_size[1] * face_mask_padding[2] / 100.0));
+    int right_padding = std::max(blur_area, static_cast<int>(crop_size[0] * face_mask_padding[1] / 100.0));
+    int left_padding = std::max(blur_area, static_cast<int>(crop_size[0] * face_mask_padding[3] / 100.0));
+
+    // Set padding regions to zero
+    // Top region
+    if (top_padding > 0) {
+        box_mask(cv::Rect(0, 0, crop_size[0], top_padding)) = 0.0;
+    }
+
+    // Bottom region
+    if (bottom_padding > 0) {
+        box_mask(cv::Rect(0, crop_size[1] - bottom_padding, crop_size[0], bottom_padding)) = 0.0;
+    }
+
+    // Left region
+    if (left_padding > 0) {
+        box_mask(cv::Rect(0, 0, left_padding, crop_size[1])) = 0.0;
+    }
+
+    // Right region
+    if (right_padding > 0) {
+        box_mask(cv::Rect(crop_size[0] - right_padding, 0, right_padding, crop_size[1])) = 0.0;
+    }
+
+    // Apply Gaussian blur if needed
+    if (blur_amount > 0) {
+        cv::GaussianBlur(box_mask, box_mask, cv::Size(0, 0), blur_amount * 0.25);
+    }
+
+    return box_mask;
+}
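For reference, the per-channel loop in `paste_back` above computes `paste = mask * warped_crop + (1 - mask) * original`. A hedged sketch of the same formula with a 3-channel mask and Mat expressions (helper name hypothetical; all inputs assumed CV_32F):

```cpp
#include <opencv2/opencv.hpp>

cv::Mat blend_with_mask(const cv::Mat &original, const cv::Mat &warped_crop, const cv::Mat &mask) {
    cv::Mat mask3;
    cv::merge(std::vector<cv::Mat>{mask, mask, mask}, mask3); // single-channel mask -> CV_32FC3
    return mask3.mul(warped_crop) + (cv::Scalar::all(1.0) - mask3).mul(original);
}
```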
diff --git a/lite/ort/cv/face_utils.h b/lite/ort/cv/face_utils.h
new file mode 100644
index 00000000..08f5b24b
--- /dev/null
+++ b/lite/ort/cv/face_utils.h
@@ -0,0 +1,60 @@
+//
+// Created by wangzijian on 11/11/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_UTILS_H
+#define LITE_AI_TOOLKIT_FACE_UTILS_H
+#include "opencv2/opencv.hpp"
+#include <fstream>
+#pragma once
+
+namespace face_utils
+{
+    cv::Mat paste_back(const cv::Mat& temp_vision_frame,
+                       const cv::Mat& crop_vision_frame,
+                       const cv::Mat& crop_mask,
+                       const cv::Mat& affine_matrix);
+
+    std::pair<cv::Mat, cv::Mat> warp_face_by_translation(const cv::Mat& temp_img,cv::Point2f& translation,
+                                                         float scale, const cv::Size& crop_size);
+
+    std::vector<float> dot_product(const std::vector<float>& vec,
+                                   const std::vector<float>& matrix,
+                                   int matrix_cols);
+
+    std::pair<cv::Mat, cv::Mat> warp_face_by_face_landmark_5(cv::Mat input_mat,
+                                                             std::vector<cv::Point2f> face_landmark_5,unsigned int type);
+
+    std::vector<cv::Point2f> convert_face_landmark_68_to_5(const std::vector<cv::Point2f>& landmark_68);
+
+    cv::Mat blend_frame(const cv::Mat &target_image, const cv::Mat &paste_frame);
+
+    cv::Mat create_static_box_mask(std::vector<float> crop_size);
+
+    void normalize(std::vector<float>& vec);
+
+    float calculate_norm(const std::vector<float>& vec);
+
+    std::vector<float> load_npy(const std::string& filename);
+
+    // TODO: the following three template vectors should be merged into one structure
+    extern const std::vector<std::vector<cv::Point2f>> face_template_vector;
+
+    extern const std::vector<cv::Point2f> face_template_128;
+
+    extern const std::vector<cv::Point2f> face_template_112;
+
+    extern const std::vector<cv::Point2f> face_template_512;
+
+    enum FaceType {
+        ARCFACE_112_V2 = 112,
+        ARCFACE_128_V2 = 128,
+        FFHQ_512 = 512
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_UTILS_H
diff --git a/lite/ort/cv/yolofacev8.h b/lite/ort/cv/yolofacev8.h
index 10e98d62..fe455db1 100644
--- a/lite/ort/cv/yolofacev8.h
+++ b/lite/ort/cv/yolofacev8.h
@@ -20,8 +20,6 @@ namespace ortcv {
     private:
         float mean = -127.5 / 128.0;
        float scale = 1 / 128.0;
-        // const float conf_threshold = 0.5f;
-        // const float iou_threshold = 0.4f;

        float ratio_width;
        float ratio_height;
diff --git a/lite/trt/cv/trt_face_68landmarks.cpp b/lite/trt/cv/trt_face_68landmarks.cpp
new file mode 100644
index 00000000..b8165ca9
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks.cpp
@@ -0,0 +1,90 @@
+//
+// Created by wangzijian on 11/12/24.
+//
+
+#include "trt_face_68landmarks.h"
+using trtcv::TRTFaceFusionFace68Landmarks;
+
+void
+TRTFaceFusionFace68Landmarks::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img) {
+    float xmin = bounding_box.x1;
+    float ymin = bounding_box.y1;
+    float xmax = bounding_box.x2;
+    float ymax = bounding_box.y2;
+
+    float width = xmax - xmin;
+    float height = ymax - ymin;
+    float max_side = std::max(width, height);
+    float scale = 195.0f / max_side;
+
+    float center_x = (xmax + xmin) * scale;
+    float center_y = (ymax + ymin) * scale;
+
+    cv::Point2f translation;
+    translation.x = (256.0f - center_x) * 0.5f;
+    translation.y = (256.0f - center_y) * 0.5f;
+
+    cv::Size crop_size(256, 256);
+
+    std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
+
+    crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
+}
+
+void TRTFaceFusionFace68Landmarks::detect(const cv::Mat &input_mat, const lite::types::BoundingBoxType<float, float> &bbox,
+                                          std::vector<cv::Point2f> &face_landmark_5of68) {
+    if (input_mat.empty()) return;
+
+    img_with_landmarks = input_mat.clone();
+    cv::Mat crop_image;
+
+    preprocess(bbox,input_mat,crop_image);
+
+    std::vector<float> input_data;
+
+    trtcv::utils::transform::create_tensor(crop_image,input_data,input_node_dims,trtcv::utils::transform::CHW);
+
+    cudaMemcpyAsync(buffers[0], input_data.data(), input_node_dims[0] * input_node_dims[1] * input_node_dims[2] * input_node_dims[3] * sizeof(float),
+                    cudaMemcpyHostToDevice, stream);
+
+    // Synchronize the stream before inference so the copy has fully completed
+    cudaStreamSynchronize(stream);
+    bool status = trt_context->enqueueV3(stream);
+    cudaStreamSynchronize(stream);
+
+    if (!status){
+        std::cerr << "Failed to infer by TensorRT." << std::endl;
+        return;
+    }
+
+    std::vector<float> output(output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2]);
+    cudaMemcpyAsync(output.data(), buffers[1], output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2] * sizeof(float),
+                    cudaMemcpyDeviceToHost, stream);
+    cudaStreamSynchronize(stream);
+
+    postprocess(output.data(),face_landmark_5of68);
+}
+
+void TRTFaceFusionFace68Landmarks::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68) {
+    std::vector<cv::Point2f> landmarks;
+
+    for (int i = 0;i < 68; ++i)
+    {
+        float x = trt_outputs[i * 3] / 64.0f * 256.f;
+        float y = trt_outputs[i * 3 + 1] / 64.0f * 256.f;
+        landmarks.emplace_back(x, y);
+    }
+
+    cv::Mat inverse_affine_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+
+    cv::transform(landmarks, landmarks, inverse_affine_matrix);
+
+    face_landmark_5of68 = face_utils::convert_face_landmark_68_to_5(landmarks);
+}
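One note on the copy/sync pattern above: work issued to a single CUDA stream executes in issue order, so the copy→infer→copy chain strictly needs only one synchronization at the end. A hedged minimal sketch (buffer names hypothetical):

```cpp
// All three operations are enqueued on the same stream, so they run in order:
cudaMemcpyAsync(d_input, h_input, in_bytes, cudaMemcpyHostToDevice, stream);     // H2D
trt_context->enqueueV3(stream);                                                  // inference
cudaMemcpyAsync(h_output, d_output, out_bytes, cudaMemcpyDeviceToHost, stream);  // D2H
cudaStreamSynchronize(stream); // a single sync after the last async op suffices
```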
diff --git a/lite/trt/cv/trt_face_68landmarks.h b/lite/trt/cv/trt_face_68landmarks.h
new file mode 100644
index 00000000..5ead05a8
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks.h
@@ -0,0 +1,37 @@
+//
+// Created by wangzijian on 11/12/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_68LANDMARKS_H
+#define LITE_AI_TOOLKIT_TRT_FACE_68LANDMARKS_H
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/core/trt_utils.h"
+#include "lite/trt/core/trt_types.h"
+#include "algorithm"
+
+namespace trtcv{
+    class LITE_EXPORTS TRTFaceFusionFace68Landmarks : public BasicTRTHandler{
+    public:
+        explicit TRTFaceFusionFace68Landmarks(const std::string& _trt_model_path,unsigned int _num_threads = 1):
+                BasicTRTHandler(_trt_model_path,_num_threads){};
+    private:
+        cv::Mat affine_matrix;
+        cv::Mat img_with_landmarks;
+    private:
+        void preprocess(const lite::types::Boxf &bounding_box,const cv::Mat &input_mat,cv::Mat &crop_img);
+
+        void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
+
+    public:
+        void detect(const cv::Mat &input_mat,const lite::types::BoundingBoxType<float, float> &bbox, std::vector<cv::Point2f> &face_landmark_5of68);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACE_68LANDMARKS_H
diff --git a/lite/trt/cv/trt_face_68landmarks_mt.cpp b/lite/trt/cv/trt_face_68landmarks_mt.cpp
new file mode 100644
index 00000000..7db634a6
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks_mt.cpp
@@ -0,0 +1,291 @@
+//
+// Created by root on 11/15/24.
+//
+
+#include "trt_face_68landmarks_mt.h"
+
+trt_face_68landmarks_mt::trt_face_68landmarks_mt(std::string &model_path, size_t num_threads) : num_threads(num_threads){
+
+    // 1. Read the serialized engine file
+    std::ifstream file(model_path, std::ios::binary);
+    if (!file.good()) {
+        std::cerr << "Failed to read model file: " << model_path << std::endl;
+        return;
+    }
+
+    file.seekg(0, std::ifstream::end);
+    size_t model_size = file.tellg();
+    file.seekg(0, std::ifstream::beg);
+    std::vector<char> model_data(model_size);
+    file.read(model_data.data(), model_size);
+    file.close();
+
+    // 2. Create the TensorRT runtime and engine
+    trt_runtime.reset(nvinfer1::createInferRuntime(logger));
+    trt_engine.reset(trt_runtime->deserializeCudaEngine(model_data.data(), model_size));
+
+    if (!trt_engine) {
+        std::cerr << "Failed to deserialize the TensorRT engine." << std::endl;
+        return;
+    }
+
+    // 3. Query the model's I/O tensors
+    int num_io_tensors = trt_engine->getNbIOTensors();
+
+    // 4. Create an execution context and a CUDA stream for each thread
+    trt_contexts.resize(num_threads);
+    streams.resize(num_threads);
+    buffers.resize(num_threads);
+
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        // Create the execution context
+        trt_contexts[thread_id].reset(trt_engine->createExecutionContext());
+        if (!trt_contexts[thread_id]) {
+            std::cerr << "Failed to create execution context for thread " << thread_id << std::endl;
+            return;
+        }
+
+        // Create the CUDA stream
+        cudaStreamCreate(&streams[thread_id]);
+
+        // Allocate input/output buffers for this thread
+        buffers[thread_id].resize(num_io_tensors);
+
+        for (int i = 0; i < num_io_tensors; ++i) {
+            auto tensor_name = trt_engine->getIOTensorName(i);
+            nvinfer1::Dims tensor_dims = trt_engine->getTensorShape(tensor_name);
+
+            // Handle the input tensor
+            if (i == 0) {
+                size_t tensor_size = 1;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                    if (thread_id == 0) { // record the input dims only on the first thread
+                        input_node_dims.push_back(tensor_dims.d[j]);
+                    }
+                }
+                cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+                trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+                continue;
+            }
+
+            // Handle the output tensors
+            size_t tensor_size = 1;
+            if (thread_id == 0) { // record the output dims only on the first thread
+                std::vector<int64_t> output_node;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    output_node.push_back(tensor_dims.d[j]);
+                    tensor_size *= tensor_dims.d[j];
+                }
+                output_node_dims.push_back(output_node);
+            } else {
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                }
+            }
+
+            cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+            trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+
+            if (thread_id == 0) {
+                output_tensor_size++;
+            }
+        }
+    }
+
+    // 5. Start the worker threads
+    for (size_t i = 0; i < num_threads; ++i) {
+        worker_threads.emplace_back(&trt_face_68landmarks_mt::worker_function, this, i);
+    }
+}
+
+// The related implementations below are adjusted in this .cpp file
+void trt_face_68landmarks_mt::worker_function(int thread_id) {
+    while (true) {
+        InferenceTask task;
+        bool has_task = false;
+
+        // Fetch a task from the queue
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex);
+            if (!task_queue.empty()) {
+                task = std::move(task_queue.front());
+                task_queue.pop();
+                has_task = true;
+                active_tasks++;
+            } else if (stop_flag) {
+                break;
+            } else {
+                condition.wait(lock);
+                continue;
+            }
+        }
+
+        if (has_task) {
+            // Process the task
+            process_single_task(task, thread_id);
+
+            // Update the active-task counter
+            {
+                std::lock_guard<std::mutex> lock(completion_mutex);
+                active_tasks--;
+                completion_cv.notify_all();
+            }
+        }
+    }
+}
+
+void
+trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img) {
+    float xmin = bounding_box.x1;
+    float ymin = bounding_box.y1;
+    float xmax = bounding_box.x2;
+    float ymax = bounding_box.y2;
+
+    float width = xmax - xmin;
+    float height = ymax - ymin;
+    float max_side = std::max(width, height);
+    float scale = 195.0f / max_side;
+
+    float center_x = (xmax + xmin) * scale;
+    float center_y = (ymax + ymin) * scale;
+
+    cv::Point2f translation;
+    translation.x = (256.0f - center_x) * 0.5f;
+    translation.y = (256.0f - center_y) * 0.5f;
+
+    cv::Size crop_size(256, 256);
+
+    std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
+
+    crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
+}
+
+void trt_face_68landmarks_mt::process_single_task( InferenceTask &task, int thread_id) {
+    if (task.input_mat.empty()) return;
+
+    img_with_landmarks = task.input_mat.clone();
+    cv::Mat crop_image;
+
+    preprocess(task.bbox, task.input_mat, crop_image);
+
+    std::vector<float> input_data;
+
+    trtcv::utils::transform::create_tensor(crop_image, input_data, input_node_dims, trtcv::utils::transform::CHW);
+
+    cudaMemcpyAsync(buffers[thread_id][0], input_data.data(),
+                    input_node_dims[0] * input_node_dims[1] * input_node_dims[2] * input_node_dims[3] * sizeof(float),
+                    cudaMemcpyHostToDevice, streams[thread_id]);
+
+    // Synchronize before inference so the host-to-device copy has fully completed
+    cudaStreamSynchronize(streams[thread_id]);
+    bool status = trt_contexts[thread_id]->enqueueV3(streams[thread_id]);
+    cudaStreamSynchronize(streams[thread_id]);
+
+    if (!status){
+        std::cerr << "Failed to infer by TensorRT." << std::endl;
+        return;
+    }
+
+    std::vector<float> output(output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2]);
+    cudaMemcpyAsync(output.data(), buffers[thread_id][1],
+                    output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2] * sizeof(float),
+                    cudaMemcpyDeviceToHost, streams[thread_id]);
+    cudaStreamSynchronize(streams[thread_id]);
+
+    // Hand the result back through the task's output pointer
+    *task.face_landmark_5of68 = postprocess(output.data());
+
+    task.completion_promise.set_value();
+}
+
+std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs) {
+    std::vector<cv::Point2f> landmarks;
+
+    for (int i = 0; i < 68; ++i)
+    {
+        float x = trt_outputs[i * 3] / 64.0f * 256.f;
+        float y = trt_outputs[i * 3 + 1] / 64.0f * 256.f;
+        landmarks.emplace_back(x, y);
+    }
+
+    cv::Mat inverse_affine_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+
+    cv::transform(landmarks, landmarks, inverse_affine_matrix);
+
+    return face_utils::convert_face_landmark_68_to_5(landmarks);
+}
+
+void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68) {
+    // Thin wrapper so both the in/out-parameter and the returning overload share one body
+    face_landmark_5of68 = postprocess(trt_outputs);
+}
+
+void trt_face_68landmarks_mt::detect_async(cv::Mat &input_image, const lite::types::Boxf &bbox,
+                                           std::vector<cv::Point2f> &face_landmark_5of68) {
+    auto promise = std::promise<void>();
+    auto future = promise.get_future(); // currently unused; completion is tracked via wait_for_completion()
+
+    // Build the task, passing a pointer to the caller's result vector
+    InferenceTask task{input_image.clone(), bbox, &face_landmark_5of68, std::move(promise)};
+
+    {
+        std::lock_guard<std::mutex> lock(queue_mutex);
+        task_queue.push(std::move(task));
+    }
+
+    condition.notify_one();
+}
+
+void trt_face_68landmarks_mt::shutdown() {
+    // Set the stop flag and wake every worker
+    stop_flag = true;
+    condition.notify_all();
+
+    // Join all worker threads
+    for (auto& thread : worker_threads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
+}
+
+void trt_face_68landmarks_mt::wait_for_completion() {
+    std::unique_lock<std::mutex> lock(completion_mutex);
+    completion_cv.wait(lock, [this]() {
+        return active_tasks == 0 && task_queue.empty();
+    });
+}
+
+trt_face_68landmarks_mt::~trt_face_68landmarks_mt() {
+    shutdown();
+
+    // Release CUDA resources
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        for (auto buffer : buffers[thread_id]) {
+            cudaFree(buffer);
+        }
+        cudaStreamDestroy(streams[thread_id]);
+    }
+}
\ No newline at end of file
diff --git a/lite/trt/cv/trt_face_68landmarks_mt.h b/lite/trt/cv/trt_face_68landmarks_mt.h
new file mode 100644
index 00000000..3088a041
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks_mt.h
@@ -0,0 +1,93 @@
+// trt_face_68landmarks_mt.h
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_LANDMARKS_MT_H_ // distinct suffix so it cannot clash with the restoration guard
+#define LITE_AI_TOOLKIT_TRT_FACE_LANDMARKS_MT_H_
+
+#include "cuda_runtime.h"
+#include "NvInfer.h"
+#include "opencv2/opencv.hpp"
+#include "opencv2/core.hpp"
+#include "lite/trt/core/trt_logger.h"
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_utils.h"
+#include <fstream>
+#include <thread>
+#include <queue>
+#include <mutex>
+#include <condition_variable>
+#include "lite/types.h"
+#include <atomic>
+#include <future>
+#include <memory>
+
+// Task descriptor for the worker pool
+struct InferenceTask {
+    cv::Mat input_mat;
+    lite::types::Boxf bbox;
+    std::vector<cv::Point2f>* face_landmark_5of68; // pointer used to hand the result back to the caller
+    std::promise<void> completion_promise;         // per-task completion signal
+};
+
+class trt_face_68landmarks_mt {
+private:
+    Logger logger;
+
+    // TensorRT components
+    std::unique_ptr<nvinfer1::IRuntime> trt_runtime;
+    std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
+    std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> trt_contexts; // one context per thread
+    std::vector<cudaStream_t> streams;                                      // one stream per thread
+    std::vector<std::vector<void*>> buffers;                                // one buffer set per thread
+
+    // Model I/O dimensions
+    std::vector<int> input_node_dims;
+    std::vector<std::vector<int>> output_node_dims;
+    std::size_t input_tensor_size = 1;
+    std::size_t output_tensor_size = 0;
+
+    // Thread-pool state
+    std::vector<std::thread> worker_threads;
+    std::queue<InferenceTask> task_queue;
+    std::mutex queue_mutex;
+    std::condition_variable condition;
+    std::atomic<bool> stop_flag{false};
+    size_t num_threads;
+
+    std::atomic<int> active_tasks{0}; // tracks in-flight tasks
+    std::mutex completion_mutex;
+    std::condition_variable completion_cv;
+
+    // Worker loop
+    void worker_function(int thread_id);
+
+    // Runs a single inference task
+    void process_single_task(InferenceTask& task, int thread_id);
+
+    void preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img);
+
+    void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
+
+    std::vector<cv::Point2f> postprocess(float *trt_outputs);
+
+private:
+    // NOTE: these members are shared across worker threads, so concurrent
+    // tasks race on them; results are only well-defined while tasks are
+    // serialized (kept as in the original design).
+    cv::Mat affine_matrix;
+    cv::Mat img_with_landmarks;
+
+public:
+    explicit trt_face_68landmarks_mt(std::string& model_path, size_t num_threads = 4);
+    ~trt_face_68landmarks_mt();
+
+    // Asynchronous task submission
+    void detect_async(cv::Mat& input_image, const lite::types::Boxf& bbox, std::vector<cv::Point2f>& face_landmark_5of68);
+
+    // Explicit shutdown (also called from the destructor)
+    void shutdown();
+
+    // Block until every queued task has finished
+    void wait_for_completion();
+};
+
+#endif // LITE_AI_TOOLKIT_TRT_FACE_LANDMARKS_MT_H_
\ No newline at end of file
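For reference, a minimal driver for this worker-pool API might look like the sketch below. It is a sketch only: the engine path and the frames/faces containers are hypothetical, and because detect_async stores a pointer to the caller's result vector, each result vector must stay alive until wait_for_completion() returns.

    // Hypothetical driver for trt_face_68landmarks_mt (engine path is a placeholder).
    #include "lite/trt/cv/trt_face_68landmarks_mt.h"

    void run_landmarks(std::vector<cv::Mat> &frames,
                       std::vector<lite::types::Boxf> &faces) {
        std::string engine_path = "./2dfan4_fp16.engine"; // placeholder
        trt_face_68landmarks_mt landmarker(engine_path, 4);

        // One result vector per task; they must outlive wait_for_completion().
        std::vector<std::vector<cv::Point2f>> results(frames.size());
        for (size_t i = 0; i < frames.size(); ++i)
            landmarker.detect_async(frames[i], faces[i], results[i]);

        landmarker.wait_for_completion(); // block until the queue drains
        landmarker.shutdown();            // join workers (also done by the destructor)
    }

Note that the shared affine_matrix member flagged above means results are only reliable with num_threads == 1 until that state is made per-task.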
+// + +#include "trt_face_recognizer.h" +using trtcv::TRTFaceFusionFaceRecognizer; + +cv::Mat TRTFaceFusionFaceRecognizer::preprocess(cv::Mat &input_mat, std::vector &face_landmark_5, + cv::Mat &preprocessed_mat) { + cv::Mat crop_image; + cv::Mat affine_martix; + + std::tie(crop_image,affine_martix) = face_utils::warp_face_by_face_landmark_5(input_mat,face_landmark_5,face_utils::ARCFACE_112_V2); + crop_image.convertTo(crop_image,CV_32FC3, 1.0f / 127.5f,-1.0); + cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB); + + return crop_image; +} + + +void TRTFaceFusionFaceRecognizer::detect(cv::Mat &input_mat, std::vector &face_landmark_5, + std::vector &embeding) { + cv::Mat ori_image = input_mat.clone(); + + cv::Mat crop_image = preprocess(input_mat,face_landmark_5,ori_image); + + + std::vector input_vector; + + trtcv::utils::transform::create_tensor(crop_image,input_vector,input_node_dims, + trtcv::utils::transform::CHW); + + cudaMemcpyAsync(buffers[0], input_vector.data(), input_node_dims[0] * input_node_dims[1] * input_node_dims[2] * input_node_dims[3] * sizeof(float), + cudaMemcpyHostToDevice, stream); + + // 在推理之前同步流,確保數據完全拷貝 + cudaStreamSynchronize(stream); + bool status = trt_context->enqueueV3(stream); + cudaStreamSynchronize(stream); + + if (!status){ + std::cerr << "Failed to infer by TensorRT." << std::endl; + return; + } + + std::vector output(output_node_dims[0][0] * output_node_dims[0][1]); + cudaMemcpyAsync(output.data(), buffers[1], output_node_dims[0][0] * output_node_dims[0][1] * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + embeding.assign(output.begin(),output.end()); + std::vector normal_embeding(output.begin(),output.end()); + + + float norm = 0.0f; + for (const auto &val : normal_embeding) { + norm += val * val; + } + norm = std::sqrt(norm); + + for (auto &val : normal_embeding) { + val /= norm; + } + + std::cout<<"done!"< &face_landmark_5,cv::Mat &preprocessed_mat); + + public: + void detect(cv::Mat &input_mat,std::vector &face_landmark_5,std::vector &embeding); + + }; +} + + + +#endif //LITE_AI_TOOLKIT_TRT_FACE_RECOGNIZER_H diff --git a/lite/trt/cv/trt_face_restoration.cpp b/lite/trt/cv/trt_face_restoration.cpp new file mode 100644 index 00000000..bb874cc5 --- /dev/null +++ b/lite/trt/cv/trt_face_restoration.cpp @@ -0,0 +1,110 @@ +// +// Created by wangzijian on 11/14/24. 
+// + +#include "trt_face_restoration.h" +using trtcv::TRTFaceFusionFaceRestoration; + +void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector &target_landmarks_5, + const std::string &face_enchaner_path) { + auto ori_image = face_swap_image.clone(); + + cv::Mat crop_image; + cv::Mat affine_matrix; + std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,face_utils::FFHQ_512); + + std::vector crop_size = {512,512}; + cv::Mat box_mask = face_utils::create_static_box_mask(crop_size); + std::vector crop_mask_list; + crop_mask_list.emplace_back(box_mask); + + cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB); + crop_image.convertTo(crop_image,CV_32FC3,1.f / 255.f); + crop_image.convertTo(crop_image,CV_32FC3,2.0f,-1.f); + + std::vector input_vector; + trtcv::utils::transform::create_tensor(crop_image,input_vector,input_node_dims,trtcv::utils::transform::CHW); + + // 拷贝 + + // 先不用拷贝了 处理完成再拷贝出来 类似于整个后处理放在GPU上完成 + cudaMemcpyAsync(buffers[0],input_vector.data(),1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyHostToDevice,stream); + + // 同步 + cudaStreamSynchronize(stream); + + // 推理 + bool status = trt_context->enqueueV3(stream); + if (!status) { + std::cerr << "Failed to inference" << std::endl; + return; + } + + + // 同步 + cudaStreamSynchronize(stream); + std::vector transposed_data(1 * 3 * 512 * 512); + +// std::vector transposed_data(1 * 3 * 512 * 512); + + // 这里buffer1就是输出了 + launch_face_restoration_postprocess( + static_cast(buffers[1]), + transposed_data.data(), + 3, + 512, + 512 + ); + + std::vector transposed_data_float(transposed_data.begin(), + transposed_data.end()); + + + // 获取输出 + std::vector output_vector(1 * 3 * 512 * 512); +// cudaMemcpyAsync(output_vector.data(),buffers[1],1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyDeviceToHost,stream); + cudaStreamSynchronize(stream); +// + // 后处理 + int channel = 3; + int height = 512; + int width = 512; +// std::vector output(channel * height * width); +// output.assign(output_vector.begin(),output_vector.end()); +// +// std::transform(output.begin(),output.end(),output.begin(), +// [](double x){return std::max(-1.0,std::max(-1.0,std::min(1.0,x)));}); +// +// std::transform(output.begin(),output.end(),output.begin(), +// [](double x){return (x + 1.f) /2.f;}); +// +// // CHW2HWC +// for (int c = 0; c < channel; ++c){ +// for (int h = 0 ; h < height; ++h){ +// for (int w = 0; w < width ; ++w){ +// int src_index = c * (height * width) + h * width + w; +// int dst_index = h * (width * channel) + w * channel + c; +// transposed_data[dst_index] = output[src_index]; +// } +// } +// } +// +// std::transform(transposed_data.begin(),transposed_data.end(),transposed_data.begin(), +// [](float x){return std::round(x * 255.f);}); +// +// std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(), +// [](float x) { return static_cast(x); }); + + + cv::Mat mat(height, width, CV_32FC3, transposed_data_float.data()); +// cv::imwrite("/home/lite.ai.toolkit/mid_process.jpg",mat); + cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR); + + + auto crop_mask = crop_mask_list[0]; + cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix); + + cv::Mat dst_image = face_utils::blend_frame(ori_image,paste_frame); + + cv::imwrite(face_enchaner_path,dst_image); +} \ No newline at end of file diff --git a/lite/trt/cv/trt_face_restoration.h b/lite/trt/cv/trt_face_restoration.h new file mode 100644 index 00000000..a525f657 --- /dev/null +++ 
diff --git a/lite/trt/cv/trt_face_restoration.h b/lite/trt/cv/trt_face_restoration.h
new file mode 100644
index 00000000..a525f657
--- /dev/null
+++ b/lite/trt/cv/trt_face_restoration.h
@@ -0,0 +1,23 @@
+//
+// Created by wangzijian on 11/14/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_H
+#define LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_H
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/core/trt_utils.h"
+#include "lite/trt/core/trt_config.h"
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/kernel/face_restoration_postprocess_manager.h"
+namespace trtcv{
+    class LITE_EXPORTS TRTFaceFusionFaceRestoration : BasicTRTHandler{
+    public:
+        explicit TRTFaceFusionFaceRestoration(const std::string& _trt_model_path, unsigned int _num_threads = 1) :
+                BasicTRTHandler(_trt_model_path, _num_threads){};
+    public:
+        void detect(cv::Mat &face_swap_image, std::vector<cv::Point2f> &target_landmarks_5, const std::string &face_enchaner_path);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_H
diff --git a/lite/trt/cv/trt_face_restoration_mt.cpp b/lite/trt/cv/trt_face_restoration_mt.cpp
new file mode 100644
index 00000000..496ac4ba
--- /dev/null
+++ b/lite/trt/cv/trt_face_restoration_mt.cpp
@@ -0,0 +1,289 @@
+// trt_face_restoration_mt.cpp
+#include "trt_face_restoration_mt.h"
+
+trt_face_restoration_mt::trt_face_restoration_mt(std::string& model_path, size_t num_threads)
+        : num_threads(num_threads) {
+    // 1. Read the serialized engine file
+    std::ifstream file(model_path, std::ios::binary);
+    if (!file.good()) {
+        std::cerr << "Failed to read model file: " << model_path << std::endl;
+        return;
+    }
+
+    file.seekg(0, std::ifstream::end);
+    size_t model_size = file.tellg();
+    file.seekg(0, std::ifstream::beg);
+    std::vector<char> model_data(model_size);
+    file.read(model_data.data(), model_size);
+    file.close();
+
+    // 2. Create the TensorRT runtime and deserialize the engine
+    trt_runtime.reset(nvinfer1::createInferRuntime(logger));
+    trt_engine.reset(trt_runtime->deserializeCudaEngine(model_data.data(), model_size));
+
+    if (!trt_engine) {
+        std::cerr << "Failed to deserialize the TensorRT engine." << std::endl;
+        return;
+    }
+
+    // 3. Query the engine's I/O tensor count
+    int num_io_tensors = trt_engine->getNbIOTensors();
+
+    // 4. Create one execution context and one CUDA stream per worker thread
+    trt_contexts.resize(num_threads);
+    streams.resize(num_threads);
+    buffers.resize(num_threads);
+
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        // Execution context
+        trt_contexts[thread_id].reset(trt_engine->createExecutionContext());
+        if (!trt_contexts[thread_id]) {
+            std::cerr << "Failed to create execution context for thread " << thread_id << std::endl;
+            return;
+        }
+
+        // CUDA stream
+        cudaStreamCreate(&streams[thread_id]);
+
+        // Per-thread input/output device buffers
+        buffers[thread_id].resize(num_io_tensors);
+
+        for (int i = 0; i < num_io_tensors; ++i) {
+            auto tensor_name = trt_engine->getIOTensorName(i);
+            nvinfer1::Dims tensor_dims = trt_engine->getTensorShape(tensor_name);
+
+            // Input tensor
+            if (i == 0) {
+                size_t tensor_size = 1;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                    if (thread_id == 0) { // record input dims once, on the first thread
+                        input_node_dims.push_back(tensor_dims.d[j]);
+                    }
+                }
+                cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+                trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+                continue;
+            }
+
+            // Output tensors
+            size_t tensor_size = 1;
+            if (thread_id == 0) { // record output dims once, on the first thread
+                std::vector<int> output_node;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    output_node.push_back(tensor_dims.d[j]);
+                    tensor_size *= tensor_dims.d[j];
+                }
+                output_node_dims.push_back(output_node);
+            } else {
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                }
+            }
+
+            cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+            trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+
+            if (thread_id == 0) {
+                output_tensor_size++;
+            }
+        }
+    }
+
+    // 5. Launch the worker threads
+    for (size_t i = 0; i < num_threads; ++i) {
+        worker_threads.emplace_back(&trt_face_restoration_mt::worker_function, this, i);
+    }
+}
+
+// Worker loop: pop tasks from the queue until shutdown is requested
+void trt_face_restoration_mt::worker_function(int thread_id) {
+    while (true) {
+        InferenceTaskTest task;
+        bool has_task = false;
+
+        // Fetch a task from the queue
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex);
+            if (!task_queue.empty()) {
+                task = std::move(task_queue.front());
+                task_queue.pop();
+                has_task = true;
+                active_tasks++;
+            } else if (stop_flag) {
+                break;
+            } else {
+                condition.wait(lock);
+                continue;
+            }
+        }
+
+        if (has_task) {
+            // Run the task
+            process_single_task(task, thread_id);
+
+            // Update the in-flight task count
+            {
+                std::lock_guard<std::mutex> lock(completion_mutex);
+                active_tasks--;
+                completion_cv.notify_all();
+            }
+        }
+    }
+}
+
+void trt_face_restoration_mt::process_single_task(const InferenceTaskTest& task, int thread_id) {
+    auto ori_image = task.face_swap_image.clone();
+
+    // 1. Preprocess
+    cv::Mat crop_image;
+    cv::Mat affine_matrix;
+    std::tie(crop_image, affine_matrix) = face_utils::warp_face_by_face_landmark_5(
+            task.face_swap_image,
+            task.target_landmarks_5,
+            face_utils::FFHQ_512
+    );
+
+    std::vector<float> crop_size = {512, 512};
+    cv::Mat box_mask = face_utils::create_static_box_mask(crop_size);
+    std::vector<cv::Mat> crop_mask_list;
+    crop_mask_list.emplace_back(box_mask);
+
+    cv::cvtColor(crop_image, crop_image, cv::COLOR_BGR2RGB);
+    crop_image.convertTo(crop_image, CV_32FC3, 1.f / 255.f);
+    crop_image.convertTo(crop_image, CV_32FC3, 2.0f, -1.f); // [0,1] -> [-1,1]
+
+    std::vector<float> input_vector;
+    trtcv::utils::transform::create_tensor(
+            crop_image,
+            input_vector,
+            input_node_dims,
+            trtcv::utils::transform::CHW
+    );
+
+    // 2. Copy the input to the GPU
+    cudaMemcpyAsync(
+            buffers[thread_id][0],
+            input_vector.data(),
+            1 * 3 * 512 * 512 * sizeof(float),
+            cudaMemcpyHostToDevice,
+            streams[thread_id]
+    );
+
+    // 3. Synchronize, then run inference
+    cudaStreamSynchronize(streams[thread_id]);
+    bool status = trt_contexts[thread_id]->enqueueV3(streams[thread_id]);
+
+    if (!status) {
+        std::cerr << "Failed to inference in thread " << thread_id << std::endl;
+        return;
+    }
+
+    cudaStreamSynchronize(streams[thread_id]);
+
+    // 4. Copy the output back to the host
+    std::vector<float> output_vector(1 * 3 * 512 * 512);
+    cudaMemcpyAsync(
+            output_vector.data(),
+            buffers[thread_id][1],
+            1 * 3 * 512 * 512 * sizeof(float),
+            cudaMemcpyDeviceToHost,
+            streams[thread_id]
+    );
+
+    cudaStreamSynchronize(streams[thread_id]);
+
+    // 5. Post-process on the CPU
+    int channel = 3;
+    int height = 512;
+    int width = 512;
+    std::vector<float> output(channel * height * width);
+    output.assign(output_vector.begin(), output_vector.end());
+
+    // Clamp to [-1, 1]
+    std::transform(output.begin(), output.end(), output.begin(),
+                   [](float x) { return std::max(-1.0f, std::min(1.0f, x)); });
+
+    // Map to [0, 1]
+    std::transform(output.begin(), output.end(), output.begin(),
+                   [](float x) { return (x + 1.f) / 2.f; });
+
+    // CHW -> HWC
+    std::vector<float> transposed_data(channel * height * width);
+    for (int c = 0; c < channel; ++c) {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+                int src_index = c * (height * width) + h * width + w;
+                int dst_index = h * (width * channel) + w * channel + c;
+                transposed_data[dst_index] = output[src_index];
+            }
+        }
+    }
+
+    // Scale to [0, 255] and round; values stay stored as float for CV_32FC3
+    std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(),
+                   [](float x) { return std::round(x * 255.f); });
+
+    // 6. Wrap as an image
+    cv::Mat mat(height, width, CV_32FC3, transposed_data.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    // 7. Paste back, blend, and save
+    auto crop_mask = crop_mask_list[0];
+    cv::Mat paste_frame = face_utils::paste_back(ori_image, mat, crop_mask, affine_matrix);
+    cv::Mat dst_image = face_utils::blend_frame(ori_image, paste_frame);
+    cv::imwrite(task.face_enchaner_path, dst_image);
+}
+
+void trt_face_restoration_mt::detect_async(
+        cv::Mat& face_swap_image,
+        std::vector<cv::Point2f>& target_landmarks_5,
+        const std::string& face_enchaner_path
+) {
+    InferenceTaskTest task{face_swap_image.clone(), target_landmarks_5, face_enchaner_path};
+
+    {
+        std::lock_guard<std::mutex> lock(queue_mutex);
+        task_queue.push(std::move(task));
+    }
+
+    condition.notify_one();
+}
+
+void trt_face_restoration_mt::shutdown() {
+    // Set the stop flag and wake every worker
+    stop_flag = true;
+    condition.notify_all();
+
+    // Join all worker threads
+    for (auto& thread : worker_threads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
+}
+
+void trt_face_restoration_mt::wait_for_completion() {
+    std::unique_lock<std::mutex> lock(completion_mutex);
+    completion_cv.wait(lock, [this]() {
+        return active_tasks == 0 && task_queue.empty();
+    });
+}
+
+trt_face_restoration_mt::~trt_face_restoration_mt() {
+    shutdown();
+
+    // Release CUDA resources
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        for (auto buffer : buffers[thread_id]) {
+            cudaFree(buffer);
+        }
+        cudaStreamDestroy(streams[thread_id]);
+    }
+}
\ No newline at end of file
diff --git a/lite/trt/cv/trt_face_restoration_mt.h b/lite/trt/cv/trt_face_restoration_mt.h
new file mode 100644
index 00000000..f9cedcbb
--- /dev/null
+++ b/lite/trt/cv/trt_face_restoration_mt.h
@@ -0,0 +1,77 @@
+// trt_face_restoration_mt.h
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_MT_H_ // note the _MT_ suffix
+#define LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_MT_H_
+
+#include "cuda_runtime.h"
+#include "NvInfer.h"
+#include "opencv2/opencv.hpp"
+#include "opencv2/core.hpp"
+#include "lite/trt/core/trt_logger.h"
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_utils.h"
+#include <fstream>
+#include <thread>
+#include <queue>
+#include <mutex>
+#include <condition_variable>
+#include <atomic>
+#include <future>
+#include <memory>
+
+// Task descriptor for the worker pool
+struct InferenceTaskTest {
+    cv::Mat face_swap_image;
+    std::vector<cv::Point2f> target_landmarks_5;
+    std::string face_enchaner_path;
+};
+
+class trt_face_restoration_mt {
+private:
+    Logger logger;
+
+    // TensorRT components
+    std::unique_ptr<nvinfer1::IRuntime> trt_runtime;
+    std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
+    std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> trt_contexts; // one context per thread
+    std::vector<cudaStream_t> streams;                                      // one stream per thread
+    std::vector<std::vector<void*>> buffers;                                // one buffer set per thread
+
+    // Model I/O dimensions
+    std::vector<int> input_node_dims;
+    std::vector<std::vector<int>> output_node_dims;
+    std::size_t input_tensor_size = 1;
+    std::size_t output_tensor_size = 0;
+
+    // Thread-pool state
+    std::vector<std::thread> worker_threads;
+    std::queue<InferenceTaskTest> task_queue;
+    std::mutex queue_mutex;
+    std::condition_variable condition;
+    std::atomic<bool> stop_flag{false};
+    size_t num_threads;
+
+    std::atomic<int> active_tasks{0}; // tracks in-flight tasks
+    std::mutex completion_mutex;
+    std::condition_variable completion_cv;
+
+    // Worker loop
+    void worker_function(int thread_id);
+
+    // Runs a single inference task
+    void process_single_task(const InferenceTaskTest& task, int thread_id);
+
+public:
+    explicit trt_face_restoration_mt(std::string& model_path, size_t num_threads = 4);
+    ~trt_face_restoration_mt();
+
+    // Asynchronous task submission
+    void detect_async(cv::Mat& face_swap_image,
+                      std::vector<cv::Point2f>& target_landmarks_5,
+                      const std::string& face_enchaner_path);
+
+    // Explicit shutdown (also called from the destructor)
+    void shutdown();
+
+    // Block until every queued task has finished
+    void wait_for_completion();
+};
+#endif
\ No newline at end of file
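As with the landmark pool, a minimal sketch of driving this class (the engine path and output names are placeholders; each task clones its input Mat, so the caller's images may be reused immediately):

    #include "lite/trt/cv/trt_face_restoration_mt.h"
    #include <string>

    // Sketch only: restores several swapped faces concurrently.
    void restore_batch(std::vector<cv::Mat> &swapped,
                       std::vector<std::vector<cv::Point2f>> &landmarks) {
        std::string engine_path = "./gfpgan_1.4_fp16.engine"; // placeholder
        trt_face_restoration_mt restorer(engine_path, 4);
        for (size_t i = 0; i < swapped.size(); ++i)
            restorer.detect_async(swapped[i], landmarks[i],
                                  "restored_" + std::to_string(i) + ".jpg");
        restorer.wait_for_completion(); // all outputs are on disk after this returns
    }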
diff --git a/lite/trt/cv/trt_face_swap.cpp b/lite/trt/cv/trt_face_swap.cpp
new file mode 100644
index 00000000..e0f1358f
--- /dev/null
+++ b/lite/trt/cv/trt_face_swap.cpp
@@ -0,0 +1,90 @@
+//
+// Created by wangzijian on 11/13/24.
+//
+
+#include "trt_face_swap.h"
+using trtcv::TRTFaceFusionFaceSwap;
+
+void TRTFaceFusionFaceSwap::preprocess(cv::Mat &target_face, std::vector<float> source_image_embeding,
+                                       std::vector<cv::Point2f> target_landmark_5,
+                                       std::vector<float> &processed_source_embeding, cv::Mat &preprocessed_mat) {
+
+    std::tie(preprocessed_mat, affine_martix) = face_utils::warp_face_by_face_landmark_5(target_face, target_landmark_5, face_utils::ARCFACE_128_V2);
+
+    std::vector<float> crop_size = {128.0, 128.0};
+    crop_list.emplace_back(face_utils::create_static_box_mask(crop_size));
+
+    cv::cvtColor(preprocessed_mat, preprocessed_mat, cv::COLOR_BGR2RGB);
+    preprocessed_mat.convertTo(preprocessed_mat, CV_32FC3, 1.0 / 255.f);
+
+    // TODO: the model matrix path is hard-coded and should come from configuration
+    std::vector<float> model_martix = face_utils::load_npy("/home/facefusion-onnxrun/python/model_matrix.npy");
+
+    processed_source_embeding = face_utils::dot_product(source_image_embeding, model_martix, 512);
+
+    face_utils::normalize(processed_source_embeding);
+}
+
+void TRTFaceFusionFaceSwap::detect(cv::Mat &target_image, std::vector<float> source_face_embeding,
+                                   std::vector<cv::Point2f> target_landmark_5, cv::Mat &face_swap_image) {
+    cv::Mat ori_image = target_image.clone();
+    std::vector<float> source_embeding_input;
+    cv::Mat model_input_mat;
+    preprocess(target_image, source_face_embeding, target_landmark_5, source_embeding_input, model_input_mat);
+
+    std::vector<float> input_vector;
+    trtcv::utils::transform::create_tensor(model_input_mat, input_vector, input_node_dims, trtcv::utils::transform::CHW);
+
+    // buffers[0]: target image tensor; buffers[1]: processed source embedding
+    cudaMemcpyAsync(buffers[0], input_vector.data(), 1 * 3 * 128 * 128 * sizeof(float), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(buffers[1], source_embeding_input.data(), 512 * sizeof(float), cudaMemcpyHostToDevice, stream);
+
+    // Synchronize before inference so both copies have completed
+    cudaStreamSynchronize(stream);
+
+    // Inference
+    bool status = trt_context->enqueueV3(stream);
+    if (!status) {
+        std::cerr << "Failed to enqueue TensorRT model." << std::endl;
+        return;
+    }
+
+    // Copy the swapped face back to the host
+    std::vector<float> output_vector(3 * 128 * 128);
+    cudaMemcpyAsync(output_vector.data(), buffers[2], 1 * 3 * 128 * 128 * sizeof(float), cudaMemcpyDeviceToHost, stream);
+    cudaStreamSynchronize(stream);
+
+    std::vector<float> output_swap_image(1 * 3 * 128 * 128);
+    output_swap_image.assign(output_vector.begin(), output_vector.end());
+
+    // CHW -> HWC
+    std::vector<float> transposed(3 * 128 * 128);
+    int channels = 3;
+    int height = 128;
+    int width = 128;
+
+    for (int c = 0; c < channels; ++c) {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+                int src_idx = c * (height * width) + h * width + w;       // CHW
+                int dst_idx = h * (width * channels) + w * channels + c;  // HWC
+                transposed[dst_idx] = output_swap_image[src_idx];
+            }
+        }
+    }
+
+    for (auto& val : transposed) {
+        val = std::round(val * 255.0);
+    }
+
+    cv::Mat mat(height, width, CV_32FC3, transposed.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    cv::Mat dst_image = face_utils::paste_back(ori_image, mat, crop_list[0], affine_martix);
+    face_swap_image = dst_image;
+}
\ No newline at end of file
diff --git a/lite/trt/cv/trt_face_swap.h b/lite/trt/cv/trt_face_swap.h
new file mode 100644
index 00000000..fac57666
--- /dev/null
+++ b/lite/trt/cv/trt_face_swap.h
@@ -0,0 +1,32 @@
+//
+// Created by wangzijian on 11/13/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_SWAP_H
+#define LITE_AI_TOOLKIT_TRT_FACE_SWAP_H
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/core/trt_utils.h"
+#include "lite/trt/core/trt_types.h"
+
+namespace trtcv{
+    class LITE_EXPORTS TRTFaceFusionFaceSwap : BasicTRTHandler{
+    public:
+        explicit TRTFaceFusionFaceSwap(const std::string& _trt_model_path, unsigned int _num_threads = 1):
+                BasicTRTHandler(_trt_model_path, _num_threads){};
+    private:
+        void preprocess(cv::Mat &target_face, std::vector<float> source_image_embeding, std::vector<cv::Point2f> target_landmark_5,
+                        std::vector<float> &processed_source_embeding, cv::Mat &preprocessed_mat);
+
+    private:
+        std::vector<cv::Mat> crop_list;
+        cv::Mat affine_martix;
+    public:
+        void detect(cv::Mat &target_image, std::vector<float> source_face_embeding, std::vector<cv::Point2f> target_landmark_5,
+                    cv::Mat &face_swap_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACE_SWAP_H
diff --git a/lite/trt/cv/trt_facefusion_pipeline.cpp b/lite/trt/cv/trt_facefusion_pipeline.cpp
new file mode 100644
index 00000000..717119b5
--- /dev/null
+++ b/lite/trt/cv/trt_facefusion_pipeline.cpp
@@ -0,0 +1,47 @@
+//
+// Created by wangzijian on 11/14/24.
+//
+
+#include "trt_facefusion_pipeline.h"
+using trtcv::TRTFaceFusionPipeLine;
+
+TRTFaceFusionPipeLine::TRTFaceFusionPipeLine(const std::string &face_detect_engine_path,
+                                             const std::string &face_landmarks_68_engine_path,
+                                             const std::string &face_recognizer_engine_path,
+                                             const std::string &face_swap_engine_path,
+                                             const std::string &face_restoration_engine_path) {
+    face_detect = std::make_unique<TRTYoloFaceV8>(face_detect_engine_path, 1);
+    face_landmarks = std::make_unique<TRTFaceFusionFace68Landmarks>(face_landmarks_68_engine_path, 1);
+    face_recognizer = std::make_unique<TRTFaceFusionFaceRecognizer>(face_recognizer_engine_path, 1);
+    face_swap = std::make_unique<TRTFaceFusionFaceSwap>(face_swap_engine_path, 1);
+    face_restoration = std::make_unique<TRTFaceFusionFaceRestoration>(face_restoration_engine_path, 1);
+}
+
+void TRTFaceFusionPipeLine::detect(const std::string &source_image, const std::string &target_image,
+                                   const std::string &save_image) {
+    // Source image: detect a face, then extract landmarks and the identity embedding
+    std::vector<lite::types::Boxf> detected_boxes;
+    cv::Mat img_bgr = cv::imread(source_image);
+    face_detect->detect(img_bgr, detected_boxes, 0.25f, 0.45f);
+
+    auto test_bounding_box = detected_boxes[0]; // use the first detected face
+    std::vector<cv::Point2f> face_landmark_5of68;
+
+    face_landmarks->detect(img_bgr, test_bounding_box, face_landmark_5of68);
+    std::vector<float> source_image_embeding;
+    face_recognizer->detect(img_bgr, face_landmark_5of68, source_image_embeding);
+
+    // Target image: detect the first face and its landmarks
+    std::vector<lite::types::Boxf> target_detected_boxes;
+    cv::Mat target_img_bgr = cv::imread(target_image);
+    face_detect->detect(target_img_bgr, target_detected_boxes, 0.25f, 0.45f);
+    auto target_test_bounding_box = target_detected_boxes[0];
+    std::vector<cv::Point2f> target_face_landmark_5of68;
+    face_landmarks->detect(target_img_bgr, target_test_bounding_box, target_face_landmark_5of68);
+
+    // Swap the identity in, then restore the face and save the result
+    cv::Mat face_swap_image;
+    face_swap->detect(target_img_bgr, source_image_embeding, target_face_landmark_5of68, face_swap_image);
+    face_restoration->detect(face_swap_image, target_face_landmark_5of68, save_image);
+}
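A minimal end-to-end invocation might look like the following; all five engine paths are placeholders for TensorRT engines built from the corresponding ONNX models:

    #include "lite/trt/cv/trt_facefusion_pipeline.h"

    int main() {
        // Sketch only: engine paths are hypothetical.
        trtcv::TRTFaceFusionPipeLine pipeline(
                "./yoloface_8n.engine",       // face detection
                "./2dfan4.engine",            // 68 landmarks
                "./arcface_w600k_r50.engine", // recognizer
                "./inswapper_128.engine",     // swapper
                "./gfpgan_1.4.engine");       // restoration
        pipeline.detect("source.jpg", "target.jpg", "result.jpg");
        return 0;
    }

Note that detect always uses the first detected face in both images, so multi-face frames would need caller-side selection.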
diff --git a/lite/trt/cv/trt_facefusion_pipeline.h b/lite/trt/cv/trt_facefusion_pipeline.h
new file mode 100644
index 00000000..b5b608e7
--- /dev/null
+++ b/lite/trt/cv/trt_facefusion_pipeline.h
@@ -0,0 +1,40 @@
+//
+// Created by wangzijian on 11/14/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACEFUSION_PIPELINE_H
+#define LITE_AI_TOOLKIT_TRT_FACEFUSION_PIPELINE_H
+
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/cv/trt_face_restoration.h"
+#include "lite/trt/cv/trt_face_swap.h"
+#include "lite/trt/cv/trt_face_recognizer.h"
+#include "lite/trt/cv/trt_yolofacev8.h"
+#include "lite/trt/cv/trt_face_68landmarks.h"
+
+namespace trtcv{
+    class TRTFaceFusionPipeLine{
+    public:
+        TRTFaceFusionPipeLine(
+                const std::string &face_detect_engine_path,
+                const std::string &face_landmarks_68_engine_path,
+                const std::string &face_recognizer_engine_path,
+                const std::string &face_swap_engine_path,
+                const std::string &face_restoration_engine_path
+        );
+
+    private:
+        std::unique_ptr<TRTFaceFusionFaceRestoration> face_restoration;
+        std::unique_ptr<TRTYoloFaceV8> face_detect;
+        std::unique_ptr<TRTFaceFusionFace68Landmarks> face_landmarks;
+        std::unique_ptr<TRTFaceFusionFaceRecognizer> face_recognizer;
+        std::unique_ptr<TRTFaceFusionFaceSwap> face_swap;
+
+    public:
+        void detect(const std::string &source_image, const std::string &target_image, const std::string &save_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACEFUSION_PIPELINE_H
diff --git a/lite/trt/cv/trt_modnet.cpp b/lite/trt/cv/trt_modnet.cpp
index 2a590960..8bbfcc6d 100644
--- a/lite/trt/cv/trt_modnet.cpp
+++ b/lite/trt/cv/trt_modnet.cpp
@@ -91,7 +91,7 @@ void TRTMODNet::generate_matting(float *trt_outputs, const cv::Mat &mat, types::MattingContent &content, bool remove_noise, bool minimum_post_process)
     const unsigned int out_w = 512;
 
     cv::Mat alpha_pred(out_h, out_w, CV_32FC1, trt_outputs);
-    cv::imwrite("/home/lite.ai.toolkit/modnet.jpg",alpha_pred);
+
     // post process
     if (remove_noise) trtcv::utils::remove_small_connected_area(alpha_pred,0.05f);
     // resize alpha
diff --git a/lite/trt/cv/trt_yolofacev8.cpp b/lite/trt/cv/trt_yolofacev8.cpp
index 8d411592..ca3381ca 100644
--- a/lite/trt/cv/trt_yolofacev8.cpp
+++ b/lite/trt/cv/trt_yolofacev8.cpp
@@ -20,6 +20,12 @@ float TRTYoloFaceV8::get_iou(const lite::types::Boxf box1, const lite::types::Boxf box2)
 }
 
+
+std::vector<int>
+TRTYoloFaceV8::nms_cuda(std::vector<lite::types::Boxf> boxes, std::vector<float> confidences, const float nms_thresh) {
+    return nms_cuda_manager->perform_nms(boxes, confidences, nms_thresh);
+}
+
 std::vector<int> TRTYoloFaceV8::nms(std::vector<lite::types::Boxf> boxes, std::vector<float> confidences, const float nms_thresh) {
     sort(confidences.begin(), confidences.end(), [&confidences](size_t index_1, size_t index_2) { return confidences[index_1] > confidences[index_2]; });
@@ -96,30 +102,33 @@ void TRTYoloFaceV8::generate_box(float *trt_outputs, std::vector<lite::types::Boxf> &boxes, float conf_threshold, float iou_threshold)
-    std::vector<lite::types::BoundingBoxType<float, float>> bounding_box_raw;
+
+    // Allocate the result vector directly in its target type
+    std::vector<lite::types::BoundingBoxType<float, float>> bounding_box_raw(num_box);
+
+    // Launch the CUDA post-process wrapper
+    launch_yolov8_postprocess(
+            static_cast<float*>(buffers[1]),
+            num_box,
+            conf_threshold,
+            ratio_height,
+            ratio_width,
+            bounding_box_raw.data(),
+            num_box
+    );
+
+    // The kernel compacts surviving boxes with atomicAdd and leaves the tail
+    // value-initialized (flag == false), so drop that tail before NMS.
+    bounding_box_raw.erase(
+            std::remove_if(bounding_box_raw.begin(), bounding_box_raw.end(),
+                           [](const lite::types::Boxf &b) { return !b.flag; }),
+            bounding_box_raw.end());
+
     std::vector<float> score_raw;
-    for (int i = 0; i < num_box; i++)
-    {
-        const float score = trt_outputs[4 * num_box + i];
-        if (score > conf_threshold)
-        {
-            float x1 = (trt_outputs[i] - 0.5 * trt_outputs[2 * num_box + i]) * ratio_width;
-            float y1 = (trt_outputs[num_box + i] - 0.5 * trt_outputs[3 * num_box + i]) * ratio_height;
-            float x2 = (trt_outputs[i] + 0.5 * trt_outputs[2 * num_box + i]) * ratio_width;
-            float y2 = (trt_outputs[num_box + i] + 0.5 * trt_outputs[3 * num_box + i]) * ratio_height;
-
-            lite::types::BoundingBoxType<float, float> bbox;
-            bbox.x1 = x1;
-            bbox.y1 = y1;
-            bbox.x2 = x2;
-            bbox.y2 = y2;
-            bbox.score = score;
-            bbox.flag = true;
-            bounding_box_raw.emplace_back(bbox);
-            score_raw.emplace_back(score);
-        }
-    }
-    std::vector<int> keep_inds = this->nms(bounding_box_raw, score_raw, iou_threshold);
+    for (const auto& bbox : bounding_box_raw) {
+        score_raw.emplace_back(bbox.score);
+    }
+
+    std::vector<int> keep_inds = nms_cuda(bounding_box_raw, score_raw, iou_threshold);
+
     const int keep_num = keep_inds.size();
     boxes.clear();
     boxes.resize(keep_num);
@@ -135,6 +144,18 @@
 void TRTYoloFaceV8::detect(const cv::Mat &mat, std::vector<lite::types::Boxf> &boxes, float conf_threshold, float iou_threshold)
 {
+    // Validate the input image
+    if (mat.empty()) {
+        std::cerr << "Input image is empty!" << std::endl;
+        return;
+    }
+
+    // Validate the TensorRT context
+    if (!trt_context) {
+        std::cerr << "TensorRT context is null!" << std::endl;
+        return;
+    }
+
     // 1.normalized the input
     cv::Mat normalized_image = normalize(mat);
diff --git a/lite/trt/cv/trt_yolofacev8.h b/lite/trt/cv/trt_yolofacev8.h
index 9f6fcd93..b9752d54 100644
--- a/lite/trt/cv/trt_yolofacev8.h
+++ b/lite/trt/cv/trt_yolofacev8.h
@@ -6,14 +6,27 @@
 #define LITE_AI_TOOLKIT_TRT_YOLOFACEV8_H
 #include "lite/trt/core/trt_core.h"
 #include "lite/trt/core/trt_utils.h"
-
+#include "lite/trt/kernel/nms_cuda_manager.h"
+#include "lite/trt/kernel/generate_bbox_cuda_manager.h"
 namespace trtcv{
     class LITE_EXPORTS TRTYoloFaceV8 : public BasicTRTHandler{
+
+    public:
+
+        std::unique_ptr<NMSCudaManager> nms_cuda_manager;
+
         explicit TRTYoloFaceV8(const std::string& _trt_model_path,unsigned int _num_threads = 1):
                 BasicTRTHandler(_trt_model_path, _num_threads)
-        {};
+        {
+            nms_cuda_manager = std::make_unique<NMSCudaManager>();
+        };
+
+        std::vector<int> nms_cuda(std::vector<lite::types::Boxf> boxes,
+                                  std::vector<float> confidences,
+                                  const float nms_thresh);
 
     private:
         float mean = -127.5 / 128.0;
diff --git a/lite/trt/kernel/face_restoration_postprocess.cu b/lite/trt/kernel/face_restoration_postprocess.cu
new file mode 100644
index 00000000..8dacd0d0
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess.cu
@@ -0,0 +1,44 @@
+#include "face_restoration_postprocess.cuh"
+
+// Step 1: clamp to [-1, 1], then map to [0, 1]
+__device__ float process_range_single(float x) {
+    x = fmax(-1.0f, fmin(1.0f, x));
+    return (x + 1.f) / 2.f;
+}
+
+// CHW -> HWC index mapping
+__device__ int get_hwc_index(int c, int h, int w, int channel, int width) {
+    return h * (width * channel) + w * channel + c;
+}
+
+// Scale to [0, 255], clamp, and round to uint8
+__device__ unsigned char float_to_uint8_simple(float x) {
+    return (unsigned char)rintf(fminf(255.f, fmaxf(0.f, x * 255.f)));
+}
+
+// Fused post-process kernel: range mapping, CHW->HWC transpose, uint8 conversion
+__global__ void face_restoration_postprocess(
+        float* input_buffer,          // input (TRT output, CHW, float)
+        unsigned char* output_final,  // final output (HWC, uint8)
+        int channel,
+        int height,
+        int width
+) {
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    int total_size = channel * height * width;
+    if (idx >= total_size) return;
+
+    // 1. Range mapping
+    float processed = process_range_single(input_buffer[idx]);
+
+    // 2. Recover (c, h, w) from the flat CHW index
+    int c = idx / (height * width);
+    int h = (idx % (height * width)) / width;
+    int w = idx % width;
+
+    // 3. Compute the HWC destination index
+    int hwc_idx = get_hwc_index(c, h, w, channel, width);
+
+    // 4. Convert to uint8 and write out
+    output_final[hwc_idx] = float_to_uint8_simple(processed);
+}
diff --git a/lite/trt/kernel/face_restoration_postprocess.cuh b/lite/trt/kernel/face_restoration_postprocess.cuh
new file mode 100644
index 00000000..adc276e1
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess.cuh
@@ -0,0 +1,8 @@
+#include "cuda_runtime.h"
+extern "C" __global__ void face_restoration_postprocess(
+        float* input_buffer,          // input (TRT output, CHW, float)
+        unsigned char* output_final,  // final output (HWC, uint8)
+        int channel,
+        int height,
+        int width
+);
\ No newline at end of file
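A quick way to validate the fused kernel is to compare it against a plain CPU reference on random input; a sketch (the function name is illustrative, not part of the toolkit):

    // CPU reference for the fused post-process kernel, for unit tests only.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    std::vector<uint8_t> reference_postprocess(const std::vector<float> &chw,
                                               int c, int h, int w) {
        std::vector<uint8_t> hwc(chw.size());
        for (int ci = 0; ci < c; ++ci)
            for (int hi = 0; hi < h; ++hi)
                for (int wi = 0; wi < w; ++wi) {
                    float x = chw[ci * h * w + hi * w + wi];
                    x = (std::clamp(x, -1.f, 1.f) + 1.f) / 2.f; // [-1,1] -> [0,1]
                    float scaled = std::min(255.f, std::max(0.f, x * 255.f));
                    hwc[hi * w * c + wi * c + ci] = static_cast<uint8_t>(std::lrintf(scaled));
                }
        return hwc;
    }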
diff --git a/lite/trt/kernel/face_restoration_postprocess_manager.cpp b/lite/trt/kernel/face_restoration_postprocess_manager.cpp
new file mode 100644
index 00000000..f1ace60b
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess_manager.cpp
@@ -0,0 +1,45 @@
+//
+// Created by root on 11/29/24.
+//
+
+#include "face_restoration_postprocess_manager.h"
+
+void launch_face_restoration_postprocess(
+        float* trt_outputs,
+        unsigned char* output_final,
+        int channel,
+        int height,
+        int width
+){
+    // Grid/block sizing: a fixed 256 threads per block
+    int block_size = 256;
+    int vec_num = channel * height * width;
+    int grid_size = (vec_num + block_size - 1) / block_size;
+
+    // Allocate device memory for the kernel's output
+    unsigned char* d_output_final;
+    cudaMalloc(&d_output_final, vec_num * sizeof(unsigned char));
+
+    // Launch the kernel
+    face_restoration_postprocess<<<grid_size, block_size>>>(
+            trt_outputs,
+            d_output_final,
+            channel,
+            height,
+            width
+    );
+    cudaDeviceSynchronize();
+    cudaError_t error = cudaGetLastError();
+    if (error != cudaSuccess) {
+        printf("CUDA error: %s\n", cudaGetErrorString(error));
+    }
+
+    // Copy the result back to the host
+    cudaMemcpy(output_final, d_output_final, vec_num * sizeof(unsigned char),
+               cudaMemcpyDeviceToHost);
+
+    // Free device memory
+    cudaFree(d_output_final);
+}
\ No newline at end of file
diff --git a/lite/trt/kernel/face_restoration_postprocess_manager.h b/lite/trt/kernel/face_restoration_postprocess_manager.h
new file mode 100644
index 00000000..57db9e11
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess_manager.h
@@ -0,0 +1,21 @@
+//
+// Created by root on 11/29/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_RESTORATION_POSTPROCESS_MANAGER_H
+#define LITE_AI_TOOLKIT_FACE_RESTORATION_POSTPROCESS_MANAGER_H
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <vector>
+#include "face_restoration_postprocess.cuh"
+
+void launch_face_restoration_postprocess(
+        float* trt_outputs,
+        unsigned char* output_final,
+        int channel,
+        int height,
+        int width
+);
+
+#endif //LITE_AI_TOOLKIT_FACE_RESTORATION_POSTPROCESS_MANAGER_H
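Usage is intentionally minimal: the first argument must be a device pointer (typically the engine's output binding), while the destination is ordinary host memory. A hedged sketch, assuming a 1x3x512x512 restoration engine whose output binding is buffers[1]:

    // Sketch: buffers[1] is assumed to be the device-side output binding.
    std::vector<unsigned char> hwc_u8(3 * 512 * 512);
    launch_face_restoration_postprocess(
            static_cast<float*>(buffers[1]), hwc_u8.data(), 3, 512, 512);
    cv::Mat restored(512, 512, CV_8UC3, hwc_u8.data()); // HWC, RGB order
    cv::cvtColor(restored, restored, cv::COLOR_RGB2BGR);

Because the launcher synchronizes and copies internally, no stream handling is needed at the call site.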
+// + +#include "generate_bbox_cuda_manager.h" +// Kernel launch wrapper function +void launch_yolov8_postprocess( + float* trt_outputs, + int number_of_boxes, + float conf_threshold, + float ratio_height, + float ratio_width, + lite::types::BoundingBoxType* output_boxes, + int max_output_boxes +) { + // 计算grid和block尺寸 + int block_size = 256; + int grid_size = (number_of_boxes + block_size - 1) / block_size; + + // 分配设备内存 + lite::types::BoundingBoxType* d_output_boxes; + int* d_output_count; + + cudaMalloc(&d_output_boxes, max_output_boxes * sizeof(lite::types::BoundingBoxType)); + cudaMalloc(&d_output_count, sizeof(int)); + cudaMemset(d_output_count, 0, sizeof(int)); + + // 启动内核 + yolov8_postprocess_kernel<<>>( + trt_outputs, + number_of_boxes, + conf_threshold, + ratio_height, + ratio_width, + d_output_boxes, + d_output_count + ); + + // 同步和错误检查 + cudaDeviceSynchronize(); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + } + + // 复制输出数据 + int h_output_count; + cudaMemcpy(&h_output_count, d_output_count, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(output_boxes, d_output_boxes, h_output_count * sizeof(lite::types::BoundingBoxType), cudaMemcpyDeviceToHost); + + // 释放设备内存 + cudaFree(d_output_boxes); + cudaFree(d_output_count); +} \ No newline at end of file diff --git a/lite/trt/kernel/generate_bbox_cuda_manager.h b/lite/trt/kernel/generate_bbox_cuda_manager.h new file mode 100644 index 00000000..83bb5263 --- /dev/null +++ b/lite/trt/kernel/generate_bbox_cuda_manager.h @@ -0,0 +1,20 @@ +// +// Created by wangzijian on 11/26/24. +// + +#pragma once +#include +#include +#include +#include "lite/types.h" +#include "generate_bbox_kernel.cuh" + +void launch_yolov8_postprocess( + float* trt_outputs, + int number_of_boxes, + float conf_threshold, + float ratio_height, + float ratio_width, + lite::types::BoundingBoxType* output_boxes, + int max_output_boxes +); \ No newline at end of file diff --git a/lite/trt/kernel/generate_bbox_kernel.cu b/lite/trt/kernel/generate_bbox_kernel.cu new file mode 100644 index 00000000..36ccb04f --- /dev/null +++ b/lite/trt/kernel/generate_bbox_kernel.cu @@ -0,0 +1,33 @@ +#include "generate_bbox_kernel.cuh" + +__global__ void yolov8_postprocess_kernel( + float* trt_outputs, + int number_of_boxes, + float conf_threshold, + float ratio_height, + float ratio_width, + lite::types::BoundingBoxType* output_boxes, // 直接使用目标类型 + int* output_count +) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= number_of_boxes) return; + + const float score = trt_outputs[4 * number_of_boxes + index]; + if (score > conf_threshold) { + + float x1 = (trt_outputs[index] - 0.5 * trt_outputs[2 * number_of_boxes + index]) * ratio_width; + float y1 = (trt_outputs[number_of_boxes + index] - 0.5 * trt_outputs[3 * number_of_boxes + index]) * ratio_height; + float x2 = (trt_outputs[index] + 0.5 * trt_outputs[2 * number_of_boxes + index]) * ratio_width; + float y2 = (trt_outputs[number_of_boxes + index] + 0.5 * trt_outputs[3 * number_of_boxes + index]) * ratio_height; + + // 使用原子操作获取输出索引 + int output_index = atomicAdd(output_count, 1); + // 直接设置BoundingBoxType + output_boxes[output_index].x1 = x1; + output_boxes[output_index].y1 = y1; + output_boxes[output_index].x2 = x2; + output_boxes[output_index].y2 = y2; + output_boxes[output_index].score = score; + output_boxes[output_index].flag = true; + } +} diff --git a/lite/trt/kernel/generate_bbox_kernel.cuh b/lite/trt/kernel/generate_bbox_kernel.cuh 
diff --git a/lite/trt/kernel/generate_bbox_kernel.cuh b/lite/trt/kernel/generate_bbox_kernel.cuh
new file mode 100644
index 00000000..c2c74834
--- /dev/null
+++ b/lite/trt/kernel/generate_bbox_kernel.cuh
@@ -0,0 +1,12 @@
+#include "cuda_runtime.h"
+#include "lite/types.h"
+
+extern "C" __global__ void yolov8_postprocess_kernel(
+        float* trt_outputs,
+        int number_of_boxes,
+        float conf_threshold,
+        float ratio_height,
+        float ratio_width,
+        lite::types::BoundingBoxType<float, float>* output_boxes, // written directly in the target type
+        int* output_count
+);
diff --git a/lite/trt/kernel/nms_cuda_manager.cpp b/lite/trt/kernel/nms_cuda_manager.cpp
new file mode 100644
index 00000000..30c5cc90
--- /dev/null
+++ b/lite/trt/kernel/nms_cuda_manager.cpp
@@ -0,0 +1,122 @@
+// nms_cuda_manager.cpp
+#include "nms_cuda_manager.h"
+#include "nms_kernel.cuh"
+#include <stdexcept>
+#include <algorithm>
+
+// Check that a CUDA call succeeded; throw on failure
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t err = call; \
+        if (err != cudaSuccess) { \
+            throw std::runtime_error("CUDA error: " + std::string(cudaGetErrorString(err))); \
+        } \
+    } while(0)
+
+NMSCudaManager::NMSCudaManager() {}
+
+NMSCudaManager::~NMSCudaManager() {
+    release_resources();
+}
+
+void NMSCudaManager::init(size_t max_boxes) {
+    // Already initialized and large enough: nothing to do
+    if (is_initialized && max_boxes <= max_boxes_num) {
+        return;
+    }
+
+    // Release any existing buffers first
+    release_resources();
+
+    try {
+        // Device memory
+        CUDA_CHECK(cudaMalloc(&d_boxes, max_boxes * 5 * sizeof(float)));
+        CUDA_CHECK(cudaMalloc(&d_result, max_boxes * sizeof(int)));
+
+        // Host memory
+        h_result = new int[max_boxes];
+
+        max_boxes_num = max_boxes;
+        is_initialized = true;
+    }
+    catch (const std::exception& e) {
+        // Make sure partially acquired resources are released on failure
+        release_resources();
+        throw;
+    }
+}
+
+void NMSCudaManager::release_resources() {
+    // Deliberately does not throw: this also runs from the destructor
+    if (d_boxes) {
+        cudaFree(d_boxes);
+        d_boxes = nullptr;
+    }
+
+    if (d_result) {
+        cudaFree(d_result);
+        d_result = nullptr;
+    }
+
+    if (h_result) {
+        delete[] h_result;
+        h_result = nullptr;
+    }
+
+    max_boxes_num = 0;
+    is_initialized = false;
+}
+
+std::vector<int> NMSCudaManager::perform_nms(
+        const std::vector<lite::types::Boxf>& boxes,
+        const std::vector<float>& confidences,
+        float nms_thresh
+) {
+    // Sanity check
+    if (boxes.size() != confidences.size()) {
+        throw std::invalid_argument("Box and confidence sizes must match");
+    }
+
+    // Initialize, or grow the buffers when this batch is larger than before
+    const int num_boxes = boxes.size();
+    if (!is_initialized || static_cast<size_t>(num_boxes) > max_boxes_num) {
+        init(std::max<size_t>(num_boxes, max_boxes_num * 2));
+    }
+
+    // Pack boxes as (x1, y1, x2, y2, score)
+    std::vector<float> box_data(num_boxes * 5);
+    for (int i = 0; i < num_boxes; ++i) {
+        box_data[i * 5]     = boxes[i].x1;
+        box_data[i * 5 + 1] = boxes[i].y1;
+        box_data[i * 5 + 2] = boxes[i].x2;
+        box_data[i * 5 + 3] = boxes[i].y2;
+        box_data[i * 5 + 4] = confidences[i];
+    }
+
+    // Copy to the GPU
+    CUDA_CHECK(cudaMemcpy(d_boxes, box_data.data(), num_boxes * 5 * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Kernel launch configuration
+    const int block_size = 256;
+    const int grid_size = (num_boxes + block_size - 1) / block_size;
+
+    // Launch the kernel
+    nms_kernel<<<grid_size, block_size>>>(d_boxes, num_boxes, nms_thresh, d_result);
+    CUDA_CHECK(cudaGetLastError());
+
+    // Wait for the kernel to finish
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Copy results back to the CPU
+    CUDA_CHECK(cudaMemcpy(h_result, d_result, num_boxes * sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Collect the kept indices
+    std::vector<int> keep_indices;
+    for (int i = 0; i < num_boxes; ++i) {
+        if (h_result[i] == 1) {
+            keep_indices.push_back(i);
+        }
+    }
+
+    return keep_indices;
+}
\ No newline at end of file
diff --git a/lite/trt/kernel/nms_cuda_manager.h b/lite/trt/kernel/nms_cuda_manager.h
new file mode 100644
index 00000000..cb5fa240
--- /dev/null
+++ b/lite/trt/kernel/nms_cuda_manager.h
@@ -0,0 +1,38 @@
+// nms_cuda_manager.h
+#pragma once
+#include <cuda_runtime.h>
+#include <vector>
+#include <cstddef>
+#include "lite/types.h"
+
+class NMSCudaManager {
+public:
+    NMSCudaManager();
+    ~NMSCudaManager();
+
+    // Non-copyable: the manager owns raw device pointers
+    NMSCudaManager(const NMSCudaManager&) = delete;
+    NMSCudaManager& operator=(const NMSCudaManager&) = delete;
+
+    // Allocate (or grow) the CUDA resources
+    void init(size_t max_boxes = 1024);
+
+    // Run NMS on the GPU and return the indices of the kept boxes
+    std::vector<int> perform_nms(
+            const std::vector<lite::types::Boxf>& boxes,
+            const std::vector<float>& confidences,
+            float nms_thresh
+    );
+
+private:
+    // Release all owned resources
+    void release_resources();
+
+    // CUDA memory pointers
+    float* d_boxes = nullptr;  // device: packed (x1,y1,x2,y2,score) boxes
+    int* d_result = nullptr;   // device: keep/suppress flags
+    int* h_result = nullptr;   // host: keep/suppress flags
+
+    size_t max_boxes_num = 0;    // current capacity
+    bool is_initialized = false; // initialization flag
+};
\ No newline at end of file
diff --git a/lite/trt/kernel/nms_kernel.cu b/lite/trt/kernel/nms_kernel.cu
new file mode 100644
index 00000000..92e47852
--- /dev/null
+++ b/lite/trt/kernel/nms_kernel.cu
@@ -0,0 +1,55 @@
+#include <cuda_runtime.h>
+
+// Layout of one packed box: (x1, y1, x2, y2, score)
+struct bbox {
+    float x1, y1, x2, y2, score;
+};
+
+// Device-side IoU between two packed boxes
+extern "C" __device__ float calculate_iou(float* a, float* b) {
+    float left = max(a[0], b[0]);
+    float right = min(a[2], b[2]);
+    float top = max(a[1], b[1]);
+    float bottom = min(a[3], b[3]);
+
+    float width = max(right - left, 0.f);
+    float height = max(bottom - top, 0.f);
+
+    float interArea = width * height;
+    float boxAArea = (a[2] - a[0]) * (a[3] - a[1]);
+    float boxBArea = (b[2] - b[0]) * (b[3] - b[1]);
+
+    // Guard against division by zero for degenerate (empty) boxes
+    float unionArea = boxAArea + boxBArea - interArea;
+    return unionArea > 0.f ? interArea / unionArea : 0.f;
+}
+
+// NMS kernel: one thread per box, O(n^2) pairwise comparisons
+extern "C" __global__ void nms_kernel(float* bboxes, int number_of_boxes, float threshold_iou, int* result) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Bounds check
+    if (index >= number_of_boxes) return;
+
+    // Assume the current box is kept until a better overlapping box is found
+    result[index] = 1;
+
+    for (int i = 0; i < number_of_boxes; i++) {
+        // Skip self
+        if (i == index) continue;
+
+        // Pointers to the current and the compared box
+        float* current_box = bboxes + index * 5;
+        float* compare_box = bboxes + i * 5;
+
+        float iou = calculate_iou(current_box, compare_box);
+
+        float current_score = current_box[4];
+        float compare_score = compare_box[4];
+
+        // Suppress the current box if it overlaps a higher-scoring box
+        if (iou > threshold_iou && compare_score > current_score) {
+            result[index] = 0;
+            break;
+        }
+    }
+}
diff --git a/lite/trt/kernel/nms_kernel.cuh b/lite/trt/kernel/nms_kernel.cuh
new file mode 100644
index 00000000..6d93735a
--- /dev/null
+++ b/lite/trt/kernel/nms_kernel.cuh
@@ -0,0 +1,4 @@
+#include "cuda_runtime.h"
+extern "C" __global__ void nms_kernel(float* bboxes, int number_of_boxes, float threshold_iou, int* result);
+
+extern "C" __device__ float calculate_iou(float* a, float* b);
\ No newline at end of file
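Finally, the manager can also be used standalone; a minimal sketch with hypothetical inputs (0.45f is an illustrative IoU threshold):

    #include "lite/trt/kernel/nms_cuda_manager.h"

    std::vector<int> run_nms(const std::vector<lite::types::Boxf> &boxes,
                             const std::vector<float> &scores) {
        NMSCudaManager nms; // device buffers are allocated lazily inside perform_nms
        return nms.perform_nms(boxes, scores, 0.45f);
    }

Keeping one long-lived NMSCudaManager and reusing it across frames avoids repeated cudaMalloc/cudaFree, which is the point of the grow-only buffer design.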