diff --git a/CMakeLists.txt b/CMakeLists.txt
index e550a53a..320c2ada 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,13 +19,18 @@ if (CMAKE_COMPILER_IS_GNUCXX)
     set(CMAKE_CXX_FLAGS "-std=c++17 -Wno-deprecated ${CMAKE_CXX_FLAGS} ")
     message(STATUS "[Linux GCC Compiler Options]+:-std=c++17 -Wno-deprecated")
 endif ()
+# Specify the CUDA compiler
+set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
+set(CMAKE_CUDA_ARCHITECTURES 89) # compute capability 8.9 = Ada Lovelace (RTX 40xx series)
+# Enable CUDA
+enable_language(CUDA)
 set(LITE_AI_ROOT_DIR ${CMAKE_SOURCE_DIR})

-option(ENABLE_TEST "build test examples." OFF)
+option(ENABLE_TEST "build test examples." ON)
 option(ENABLE_DEBUG_STRING "enable DEBUG string or not" ON)
 option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine" ON)
-option(ENABLE_TENSORRT "enable TensorRT engine" OFF)
+option(ENABLE_TENSORRT "enable TensorRT engine" ON)
 option(ENABLE_MNN "enable MNN engine" OFF)
 option(ENABLE_NCNN "enable NCNN engine" OFF)
 option(ENABLE_TNN "enable TNN engine" OFF)
diff --git a/cmake/opencv.cmake b/cmake/opencv.cmake
index cea2b051..912446c5 100644
--- a/cmake/opencv.cmake
+++ b/cmake/opencv.cmake
@@ -19,9 +19,9 @@ link_directories(${OpenCV_DIR}/lib)
 if(NOT WIN32)
     if(ENABLE_OPENCV_VIDEOIO OR ENABLE_TEST)
-        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs opencv_video opencv_videoio)
+        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs opencv_video opencv_videoio opencv_calib3d)
     else()
-        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs) # no videoio, video module
+        set(OpenCV_LIBS opencv_core opencv_imgproc opencv_imgcodecs opencv_calib3d) # no videoio, video module
     endif()
 else()
     set(OpenCV_LIBS opencv_world490)
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index 3de10b85..a88f4673 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -41,6 +41,8 @@ link_directories(${TensorRT_DIR}/lib)
 # 1. glob sources files
 file(GLOB TENSORRT_CORE_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/core/*.cpp)
+file(GLOB TENSORRT_CUDA_KERNEL_SRCS_CPP ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.cpp)
+file(GLOB TENSORRT_CUDA_KERNEL_SRCS_CU ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.cu)
 file(GLOB TENSORRT_CV_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/cv/*.cpp)
 file(GLOB TENSORRT_NLP_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/nlp/*.cpp)
 file(GLOB TENSORRT_ASR_SRCS ${CMAKE_SOURCE_DIR}/lite/trt/asr/*.cpp)
@@ -52,8 +54,17 @@ file(GLOB TENSORRT_CV_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/cv/*.h)
 file(GLOB TENSORRT_NLP_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/nlp/*.h)
 file(GLOB TENSORRT_ASR_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/asr/*.h)
 file(GLOB TENSORRT_SD_HEAD ${CMAKE_SOURCE_DIR}/lite/trt/sd/*.h)
+file(GLOB TENSORRT_CUDA_KERNEL_HEAD_CPP ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.h)
+file(GLOB TENSORRT_CUDA_KERNEL_HEAD_CU ${CMAKE_SOURCE_DIR}/lite/trt/kernel/*.cuh)
+
+
+
+set(TRT_SRCS ${TENSORRT_CV_SRCS} ${TENSORRT_NLP_SRCS} ${TENSORRT_ASR_SRCS} ${TENSORRT_CORE_SRCS} ${TENSORRT_SD_SRCS}
+        ${TENSORRT_CUDA_KERNEL_SRCS_CPP} ${TENSORRT_CUDA_KERNEL_SRCS_CU})
+set_source_files_properties(${TENSORRT_CUDA_KERNEL_SRCS_CU} ${TENSORRT_CUDA_KERNEL_SRCS_CPP}
+        ${TENSORRT_CUDA_KERNEL_HEAD_CPP} ${TENSORRT_CUDA_KERNEL_HEAD_CU}
+        PROPERTIES LANGUAGE CUDA)
-set(TRT_SRCS ${TENSORRT_CV_SRCS} ${TENSORRT_NLP_SRCS} ${TENSORRT_ASR_SRCS} ${TENSORRT_CORE_SRCS} ${TENSORRT_SD_SRCS})
 # 3. copy
 message("[Lite.AI.Toolkit][I] Installing Lite.AI.ToolKit Headers for TensorRT Backend ...")
 # "INSTALL" can copy all files from the list to the specified path.
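A note on the CMakeLists.txt change above: hardcoding the nvcc path and a single architecture ties the build to one machine. A hedged, more portable sketch (assumes CMake >= 3.24 for the `native` value; not part of this patch):

```cmake
# Hypothetical alternative: let CMake discover the toolchain and the local GPU.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES native) # targets the GPU of the build machine
endif()
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)       # provides imported targets such as CUDA::cudart
```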
@@ -63,4 +74,5 @@ file(INSTALL ${TENSORRT_CV_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lit
 file(INSTALL ${TENSORRT_ASR_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/asr)
 file(INSTALL ${TENSORRT_NLP_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/nlp)
 file(INSTALL ${TENSORRT_SD_HEAD} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/sd)
-
+file(INSTALL ${TENSORRT_CUDA_KERNEL_HEAD_CPP} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/kernel)
+file(INSTALL ${TENSORRT_CUDA_KERNEL_HEAD_CU} DESTINATION ${CMAKE_INSTALL_PREFIX}/include/lite/trt/kernel)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index f089cb7f..9073a3c4 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -52,7 +52,7 @@ function(add_lite_ai_toolkit_shared_library version soversion)
     if (ENABLE_TENSORRT)
         include(cmake/tensorrt.cmake)
         set(LITE_SRCS ${LITE_SRCS} ${TRT_SRCS})
-        set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cudart nvinfer nvonnxparser
+        set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cuda cudart nvinfer nvonnxparser
                 nvinfer_plugin ddim_scheduler_cpp)
         link_directories(${CMAKE_SOURCE_DIR}/lite/bin)
     endif ()
diff --git a/examples/lite/CMakeLists.txt b/examples/lite/CMakeLists.txt
index 929e07d5..8e99cd70 100644
--- a/examples/lite/CMakeLists.txt
+++ b/examples/lite/CMakeLists.txt
@@ -103,5 +103,10 @@ add_lite_executable(lite_face_parsing_bisenet_dyn cv)
 add_lite_executable(lite_yolov8face cv)
 add_lite_executable(lite_lightenhance cv)
 add_lite_executable(lite_realesrgan cv)
+add_lite_executable(lite_face_68landmarks cv)
+add_lite_executable(lite_face_recognizer cv)
+add_lite_executable(lite_face_swap cv)
+add_lite_executable(lite_face_restoration cv)
+add_lite_executable(lite_facefusion_pipeline cv)
 add_lite_executable(lite_yolov8 cv)
 add_lite_executable(lite_sd_pipeline sd)
diff --git a/examples/lite/cv/test_lite_face_68landmarks.cpp b/examples/lite/cv/test_lite_face_68landmarks.cpp
new file mode 100644
index 00000000..0f4a0da9
--- /dev/null
+++ b/examples/lite/cv/test_lite_face_68landmarks.cpp
@@ -0,0 +1,112 @@
+//
+// Created by wangzijian on 11/1/24.
+//
+#include "lite/lite.h"
+#include "lite/trt/cv/trt_face_68landmarks_mt.h"
+
+static void test_default()
+{
+#ifdef ENABLE_ONNXRUNTIME
+    std::string onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/2dfan4.onnx";
+    std::string test_img_path = "/home/lite.ai.toolkit/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
+
+    // 1. Test Default Engine ONNXRuntime
+    lite::cv::faceid::Face_68Landmarks *face68Landmarks = new lite::cv::faceid::Face_68Landmarks(onnx_path);
+
+    lite::types::BoundingBoxType<float, float> bbox;
+    bbox.x1 = 487;
+    bbox.y1 = 236;
+    bbox.x2 = 784;
+    bbox.y2 = 624;
+
+    cv::Mat img_bgr = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect(img_bgr, bbox, face_landmark_5of68);
+
+    std::cout << "face id detect done!" << std::endl;
+    lite::types::BoundingBoxType<float, float> bbox;
+    bbox.x1 = 487;
+    bbox.y1 = 236;
+    bbox.x2 = 784;
+    bbox.y2 = 624;
+
+    cv::Mat img_bgr = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect(img_bgr, bbox, face_landmark_5of68);
+
+    std::cout << "face id detect done!" << std::endl;
+    lite::types::BoundingBoxType<float, float> bbox;
+
+    bbox.x1 = 487;
+    bbox.y1 = 236;
+    bbox.x2 = 784;
+    bbox.y2 = 624;
+
+    cv::Mat img_bgr = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of68);
+
+    cv::Mat img_bgr2 = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of682;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of682);
+
+    cv::Mat img_bgr3 = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of683;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of683);
+
+    cv::Mat img_bgr4 = cv::imread(test_img_path);
+    std::vector<cv::Point2f> face_landmark_5of684;
+    face68Landmarks->detect_async(img_bgr, bbox, face_landmark_5of684);
+
+    face68Landmarks->wait_for_completion();
+
+    face68Landmarks->shutdown();
+
+    std::cout << "face id detect done!" << std::endl;
+    std::vector<cv::Point2f> face_landmark_5 = {
+            cv::Point2f(568.2485f, 398.9512f),
+            cv::Point2f(701.7346f, 399.64795f),
+            cv::Point2f(634.2213f, 482.92694f),
+            cv::Point2f(583.5656f, 543.10187f),
+            cv::Point2f(684.52405f, 543.125f)
+    };
+    cv::Mat img_bgr = cv::imread(test_img_path);
+
+    std::vector<float> source_image_embeding;
+
+    face_recognizer->detect(img_bgr,face_landmark_5,source_image_embeding);
+
+    std::cout << "face id detect done!" << std::endl;
+    std::vector<cv::Point2f> face_landmark_5 = {
+            cv::Point2f(569.092041f, 398.845886f),
+            cv::Point2f(701.891724f, 399.156677f),
+            cv::Point2f(634.767212f, 482.927216f),
+            cv::Point2f(584.270996f, 543.294617f),
+            cv::Point2f(684.877991f, 543.067078f)
+    };
+    cv::Mat img_bgr = cv::imread(test_img_path);
+
+    face_restoration->detect(img_bgr,face_landmark_5,save_img_path);
+
+    std::cout << "face restoration detect done!" << std::endl;
+    auto face_restoration_trt = std::make_unique<trt_face_restoration_mt>(engine_path,4);
+
+//    trt_face_restoration_mt *face_restoration_trt = new trt_face_restoration_mt(engine_path);
+
+    // 2. Prepare the test data - as an example, we process four copies of the same image
+    std::vector<std::string> test_img_paths = {
+            "/home/lite.ai.toolkit/trt_result.jpg",
+            "/home/lite.ai.toolkit/trt_result_2.jpg",
+            "/home/lite.ai.toolkit/trt_result_3.jpg",
+            "/home/lite.ai.toolkit/trt_result_4.jpg"
+    };
+
+    std::vector<std::string> save_img_paths = {
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread1.jpg",
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread2.jpg",
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread3.jpg",
+            "/home/lite.ai.toolkit/trt_facerestoration_mt_thread4.jpg"
+    };
+
+    std::vector<cv::Point2f> face_landmark_5 = {
+            cv::Point2f(569.092041f, 398.845886f),
+            cv::Point2f(701.891724f, 399.156677f),
+            cv::Point2f(634.767212f, 482.927216f),
+            cv::Point2f(584.270996f, 543.294617f),
+            cv::Point2f(684.877991f, 543.067078f)
+    };
+//    cv::Mat img_bgr = cv::imread(test_img_path);
+//
+//    face_restoration_trt->detect_async(img_bgr,face_landmark_5,save_img_path);
+//
+//    std::cout<<"face restoration detect done!"<<std::endl;
+
+    // Start timing, then submit one async task per image
+    auto start_time = std::chrono::high_resolution_clock::now();
+
+    for (size_t i = 0; i < test_img_paths.size(); ++i) {
+        cv::Mat img_bgr = cv::imread(test_img_paths[i]);
+        face_restoration_trt->detect_async(img_bgr, face_landmark_5, save_img_paths[i]);
+        std::cout << "Submitted task " << i + 1 << " for processing" << std::endl;
+    }
+
+    // 6. Wait for all tasks to complete
+    std::cout << "Waiting for all tasks to complete..." << std::endl;
+    face_restoration_trt->wait_for_completion();
+
+    // 7. Compute and print the total elapsed time
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+
+    std::cout << "All tasks completed!" << std::endl;
+    std::cout << "Total processing time: " << duration.count() << "ms" << std::endl;
+    std::cout << "Average time per image: " << duration.count() / test_img_paths.size() << "ms" << std::endl;
+
+#endif
+}
+
+int main(__unused int argc, __unused char *argv[])
+{
+//    test_default();
+    test_tensorrt();
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/lite/cv/test_lite_face_swap.cpp b/examples/lite/cv/test_lite_face_swap.cpp
new file mode 100644
index 00000000..a7429030
--- /dev/null
+++ b/examples/lite/cv/test_lite_face_swap.cpp
@@ -0,0 +1,55 @@
+//
+// Created by wangzijian on 11/5/24.
+//
+#include "lite/lite.h"
+
+static void test_default()
+{
+#ifdef ENABLE_ONNXRUNTIME
+    std::string face_swap_onnx_path = "../../../examples/hub/onnx/cv/inswapper_128.onnx";
+    std::string face_detect_onnx_path = "../../../examples/hub/onnx/cv/yoloface_8n.onnx";
+    std::string face_landmarks_68 = "../../../examples/hub/onnx/cv/2dfan4.onnx";
+    std::string face_recognizer_onnx_path = "../../../examples/hub/onnx/cv/arcface_w600k_r50.onnx";
+
+    std::string source_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
+    std::string target_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_target.jpg";
+
+    lite::cv::face::detect::YOLOV8Face *yolov8_face = new lite::cv::face::detect::YOLOV8Face(face_detect_onnx_path);
+    lite::cv::faceid::Face_68Landmarks *face68Landmarks = new lite::cv::faceid::Face_68Landmarks(face_landmarks_68);
+    lite::cv::faceid::Face_Recognizer *face_recognizer = new lite::cv::faceid::Face_Recognizer(face_recognizer_onnx_path);
+    lite::cv::face::swap::InSwapper *face_swap = new lite::cv::face::swap::InSwapper(face_swap_onnx_path);
+
+    std::vector<lite::types::Boxf> detected_boxes;
+    cv::Mat img_bgr = cv::imread(source_image_path);
+    yolov8_face->detect(img_bgr, detected_boxes);
+    int position = 0; // the face at index 0
+    auto test_bounding_box = detected_boxes[0];
+    std::vector<cv::Point2f> face_landmark_5of68;
+    face68Landmarks->detect(img_bgr, test_bounding_box, face_landmark_5of68);
+    std::vector<float> source_image_embeding;
+    face_recognizer->detect(img_bgr,face_landmark_5of68,source_image_embeding);
+
+    // the source face is handled above; now process the target
+    std::vector<lite::types::Boxf> target_detected_boxes;
+    cv::Mat target_img_bgr = cv::imread(target_image_path);
+    yolov8_face->detect(target_img_bgr, target_detected_boxes);
+    auto target_test_bounding_box = target_detected_boxes[0];
+    std::vector<cv::Point2f> target_face_landmark_5of68;
+    face68Landmarks->detect(target_img_bgr, target_test_bounding_box,target_face_landmark_5of68);
+
+    cv::Mat face_swap_image;
+    face_swap->detect(target_img_bgr,source_image_embeding,target_face_landmark_5of68,face_swap_image);
+
+    delete yolov8_face;
+    delete face68Landmarks;
+    delete face_swap;
+    delete face_recognizer;
+#endif
+}
+
+int main(__unused int argc, __unused char *argv[])
+{
+    test_default();
+    return 0;
+}
\ No newline at end of file
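The test above manages the four detectors with raw `new`/`delete`. A hedged RAII variant of the same setup (no API changes assumed; this is a sketch, not part of the patch):

```cpp
// std::unique_ptr gives identical behavior without the manual deletes.
auto yolov8_face = std::make_unique<lite::cv::face::detect::YOLOV8Face>(face_detect_onnx_path);
auto face68Landmarks = std::make_unique<lite::cv::faceid::Face_68Landmarks>(face_landmarks_68);
auto face_recognizer = std::make_unique<lite::cv::faceid::Face_Recognizer>(face_recognizer_onnx_path);
auto face_swap = std::make_unique<lite::cv::face::swap::InSwapper>(face_swap_onnx_path);
// ... use exactly as before; destruction happens automatically at scope exit.
```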
diff --git a/examples/lite/cv/test_lite_facefusion_pipeline.cpp b/examples/lite/cv/test_lite_facefusion_pipeline.cpp
new file mode 100644
index 00000000..7134900d
--- /dev/null
+++ b/examples/lite/cv/test_lite_facefusion_pipeline.cpp
@@ -0,0 +1,45 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+#include "lite/lite.h"
+static void test_default()
+{
+#ifdef ENABLE_ONNXRUNTIME
+    std::string face_swap_onnx_path = "../../../examples/hub/onnx/cv/inswapper_128.onnx";
+    std::string face_detect_onnx_path = "../../../examples/hub/onnx/cv/yoloface_8n.onnx";
+    std::string face_landmarks_68 = "../../../examples/hub/onnx/cv/2dfan4.onnx";
+    std::string face_recognizer_onnx_path = "../../../examples/hub/onnx/cv/arcface_w600k_r50.onnx";
+    std::string face_restoration_onnx_path = "../../../examples/hub/onnx/cv/gfpgan_1.4.onnx";
+
+    auto pipeLine = lite::cv::face::swap::facefusion::PipeLine(
+            face_detect_onnx_path,
+            face_landmarks_68,
+            face_recognizer_onnx_path,
+            face_swap_onnx_path,
+            face_restoration_onnx_path
+    );
+
+    std::string source_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
+    std::string target_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_target.jpg";
+    std::string save_image_path = "../../../examples/logs/test_lite_facefusion_pipeline_result.jpg";
+
+    // Time the pipeline run
+    auto start = std::chrono::high_resolution_clock::now();
+
+    pipeLine.detect(source_image_path,target_image_path,save_image_path);
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end - start;
+    std::cout << "Time: " << diff.count() << " s\n";
+
+#endif
+}
+
+int main()
+{
+    test_default();
+}
\ No newline at end of file
diff --git a/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg b/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg
new file mode 100644
index 00000000..d04c2239
Binary files /dev/null and b/examples/lite/resources/test_lite_facefusion_pipeline_source.jpg differ
diff --git a/examples/lite/resources/test_lite_facefusion_pipeline_target.jpg b/examples/lite/resources/test_lite_facefusion_pipeline_target.jpg
new file mode 100644
index 00000000..a1948be1
Binary files /dev/null and b/examples/lite/resources/test_lite_facefusion_pipeline_target.jpg differ
diff --git a/lite/models.h b/lite/models.h
index 60eb82e6..4dd27101 100644
--- a/lite/models.h
+++ b/lite/models.h
@@ -115,6 +115,11 @@
 #include "lite/ort/cv/yolofacev8.h"
 #include "lite/ort/cv/light_enhance.h"
 #include "lite/ort/cv/real_esr_gan.h"
+#include "lite/ort/cv/face_68landmarks.h"
+#include "lite/ort/cv/face_recognizer.h"
+#include "lite/ort/cv/face_swap.h"
+#include "lite/ort/cv/face_restoration.h"
+#include "lite/ort/cv/face_fusion_pipeline.h"
 #include "lite/ort/sd/clip.h"
 #include "lite/ort/sd/unet.h"
 #include "lite/ort/sd/vae.h"
@@ -136,6 +141,11 @@
 #include "lite/trt/cv/trt_yolov5_blazeface.h"
 #include "lite/trt/cv/trt_lightenhance.h"
 #include "lite/trt/cv/trt_realesrgan.h"
+#include "lite/trt/cv/trt_face_68landmarks.h"
+#include "lite/trt/cv/trt_face_recognizer.h"
+#include "lite/trt/cv/trt_face_swap.h"
+#include "lite/trt/cv/trt_face_restoration.h"
+#include "lite/trt/cv/trt_facefusion_pipeline.h"
 #include "lite/trt/sd/trt_clip.h"
 #include "lite/trt/sd/trt_vae.h"
 #include "lite/trt/sd/trt_unet.h"
@@ -502,6 +512,11 @@ namespace lite
     typedef ortcv::YoloFaceV8 _ONNXYOLOFaceNet;
     typedef ortcv::LightEnhance _ONNXLightEnhance;
     typedef ortcv::RealESRGAN _ONNXRealESRGAN;
+    typedef ortcv::Face_68Landmarks _ONNXFace_68Landmarks;
+    typedef ortcv::Face_Recognizer _ONNXFace_Recognizer;
+    typedef ortcv::Face_Swap _ONNXFace_Swap;
+    typedef ortcv::Face_Restoration _ONNXFace_Restoration;
+    typedef ortcv::Face_Fusion_Pipeline _ONNXFace_Fusion_Pipeline;

    // 1. classification
    namespace classification
@@ -580,6 +595,25 @@
      }
+     namespace swap
+     {
+       namespace facefusion
+       {
+         typedef _ONNXYOLOFaceNet YOLOV8Face;
+         typedef _ONNXFace_Swap InSwapper;
+         typedef _ONNXFace_Restoration GFPGAN;
+         typedef _ONNXFace_68Landmarks Face_68Landmarks;
+         typedef _ONNXFace_Recognizer Face_Recognizer;
+         typedef _ONNXFace_Fusion_Pipeline PipeLine;
+       }
+       typedef _ONNXFace_Swap InSwapper;
+     }
+
+     namespace restoration
+     {
+       typedef _ONNXFace_Restoration GFPGAN;
+     }
+
     namespace pose
     {
       typedef _ONNXFSANet FSANet; // head pose estimation.
@@ -618,7 +652,8 @@
       typedef _ONNXCavaGhostArcFace CavaGhostArcFace;
       typedef _ONNXCavaCombinedFace CavaCombinedFace;
       typedef _ONNXMobileSEFocalFace MobileSEFocalFace;
-
+      typedef _ONNXFace_68Landmarks Face_68Landmarks;
+      typedef _ONNXFace_Recognizer Face_Recognizer;
     }
     // 5. segmentation
     namespace segmentation
@@ -733,6 +768,11 @@ namespace lite{
    typedef trtcv::TRTLightEnhance _TRT_LightEnhance;
    typedef trtcv::TRTRealESRGAN _TRT_RealESRGAN;
    typedef trtcv::TRTMODNet _TRT_MODNet;
+   typedef trtcv::TRTFaceFusionFace68Landmarks _TRT_FaceFusionFace68Landmarks;
+   typedef trtcv::TRTFaceFusionFaceRecognizer _TRTFaceFusionFaceRecognizer;
+   typedef trtcv::TRTFaceFusionFaceSwap _TRTFaceFusionFaceSwap;
+   typedef trtcv::TRTFaceFusionFaceRestoration _TRTFaceFusionFaceRestoration;
+   typedef trtcv::TRTFaceFusionPipeLine _TRTFaceFusionPipeLine;
    namespace classification
    {
@@ -755,6 +795,20 @@
      typedef _TRT_YOLOFaceNet YOLOV8Face;
      typedef _TRT_YOLO5Face YOLOV5Face;
    }
+    namespace swap
+    {
+      typedef _TRTFaceFusionFaceSwap FaceFusionFaceSwap;
+      typedef _TRTFaceFusionPipeLine FaceFusionPipeLine;
+    }
+    namespace restoration
+    {
+      typedef _TRTFaceFusionFaceRestoration TRTGFPGAN;
+    }
+  }
+  namespace faceid
+  {
+    typedef _TRT_FaceFusionFace68Landmarks FaceFusionFace68Landmarks;
+    typedef _TRTFaceFusionFaceRecognizer FaceFusionFaceRecognizer;
   }
   namespace lightenhance
   {
diff --git a/lite/ort/cv/face_68landmarks.cpp b/lite/ort/cv/face_68landmarks.cpp
new file mode 100644
index 00000000..7bbff59c
--- /dev/null
+++ b/lite/ort/cv/face_68landmarks.cpp
@@ -0,0 +1,97 @@
+//
+// Created by wangzijian on 11/1/24.
+//
+
+#include "face_68landmarks.h"
+
+using ortcv::Face_68Landmarks;
+
+void Face_68Landmarks::preprocess(const lite::types::Boxf &bounding_box,
+                                  const cv::Mat &input_mat,
+                                  cv::Mat &crop_img) {
+
+    float xmin = bounding_box.x1;
+    float ymin = bounding_box.y1;
+    float xmax = bounding_box.x2;
+    float ymax = bounding_box.y2;
+
+    float width = xmax - xmin;
+    float height = ymax - ymin;
+    float max_side = std::max(width, height);
+    float scale = 195.0f / max_side;
+
+    float center_x = (xmax + xmin) * scale;
+    float center_y = (ymax + ymin) * scale;
+
+    cv::Point2f translation;
+    translation.x = (256.0f - center_x) * 0.5f;
+    translation.y = (256.0f - center_y) * 0.5f;
+
+    cv::Size crop_size(256, 256);
+
+    std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
+
+    crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
+}
+
+Ort::Value Face_68Landmarks::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_68Landmarks::detect(const cv::Mat &input_mat, const lite::types::BoundingBoxType<float, float> &bbox,
+                              std::vector<cv::Point2f> &face_landmark_5of68) {
+    if (input_mat.empty()) return;
+
+    img_with_landmarks = input_mat.clone();
+    cv::Mat crop_image;
+
+    preprocess(bbox,input_mat,crop_image);
+
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    postprocess(output_tensors,face_landmark_5of68);
+}
+
+void Face_68Landmarks::postprocess(std::vector<Ort::Value> &ort_outputs,
+                                   std::vector<cv::Point2f> &face_landmark_5of68) {
+    float *pdata = ort_outputs[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = ort_outputs[0].GetTensorTypeAndShapeInfo().GetShape();
+    std::vector<cv::Point2f> landmarks;
+
+    for (int i = 0;i < 68; ++i)
+    {
+        // predictions live on a 64x64 grid; map them back to the 256x256 crop
+        float x = pdata[i * 3] / 64.0f * 256.f;
+        float y = pdata[i * 3 + 1] / 64.0f * 256.f;
+        landmarks.emplace_back(x, y);
+    }
+
+    cv::Mat inverse_affine_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+
+    cv::transform(landmarks, landmarks, inverse_affine_matrix);
+
+    face_landmark_5of68 = face_utils::convert_face_landmark_68_to_5(landmarks);
+}
diff --git a/lite/ort/cv/face_68landmarks.h b/lite/ort/cv/face_68landmarks.h
new file mode 100644
index 00000000..3bb89c79
--- /dev/null
+++ b/lite/ort/cv/face_68landmarks.h
@@ -0,0 +1,45 @@
+//
+// Created by wangzijian on 11/1/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_68LANDMARKS_H
+#define LITE_AI_TOOLKIT_FACE_68LANDMARKS_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/core/ort_types.h"
+#include "lite/ort/core/ort_utils.h"
+#include "lite/utils.h"
+#include "lite/ort/cv/face_utils.h"
+#include "algorithm"
+
+namespace ortcv{
+    class LITE_EXPORTS Face_68Landmarks : public BasicOrtHandler{
+    public:
+        explicit Face_68Landmarks(const std::string &_onnx_path, unsigned int _num_threads = 1):
+                BasicOrtHandler(_onnx_path, _num_threads){};
+
+        ~Face_68Landmarks() override = default;
+
+    private:
+        cv::Mat affine_matrix;
+        cv::Mat img_with_landmarks;
+
+    private:
+        void preprocess(const lite::types::Boxf &bounding_box,const cv::Mat &input_mat,cv::Mat &crop_img);
+
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+        void postprocess(std::vector<Ort::Value> &ort_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
+
+    public:
+        void detect(const cv::Mat &input_mat,const lite::types::BoundingBoxType<float, float> &bbox, std::vector<cv::Point2f> &face_landmark_5of68);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_68LANDMARKS_H
diff --git a/lite/ort/cv/face_fusion_pipeline.cpp b/lite/ort/cv/face_fusion_pipeline.cpp
new file mode 100644
index 00000000..f3192719
--- /dev/null
+++ b/lite/ort/cv/face_fusion_pipeline.cpp
@@ -0,0 +1,45 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#include "face_fusion_pipeline.h"
+using ortcv::Face_Fusion_Pipeline;
+
+Face_Fusion_Pipeline::Face_Fusion_Pipeline(const std::string &face_detect_onnx_path,
+                                           const std::string &face_landmarks_68_onnx_path,
+                                           const std::string &face_recognizer_onnx_path,
+                                           const std::string &face_swap_onnx_path,
+                                           const std::string &face_restoration_onnx_path) {
+    face_detect = std::make_unique<YoloFaceV8>(face_detect_onnx_path,6);
+    face_landmarks = std::make_unique<Face_68Landmarks>(face_landmarks_68_onnx_path,6);
+    face_recognizer = std::make_unique<Face_Recognizer>(face_recognizer_onnx_path,6);
+    face_swap = std::make_unique<Face_Swap>(face_swap_onnx_path,6);
+    face_restoration = std::make_unique<Face_Restoration>(face_restoration_onnx_path,6);
+}
+
+void Face_Fusion_Pipeline::detect(const std::string &source_image, const std::string &target_image,const std::string &save_image_path) {
+    std::vector<lite::types::Boxf> detected_boxes;
+    cv::Mat img_bgr = cv::imread(source_image);
+    face_detect->detect(img_bgr,detected_boxes);
+
+    int position = 0; // position number 0
+    auto test_bounding_box = detected_boxes[0];
+    std::vector<cv::Point2f> face_landmark_5of68;
+
+    face_landmarks->detect(img_bgr, test_bounding_box, face_landmark_5of68);
+    std::vector<float> source_image_embeding;
+    face_recognizer->detect(img_bgr,face_landmark_5of68,source_image_embeding);
+
+    std::vector<lite::types::Boxf> target_detected_boxes;
+    cv::Mat target_img_bgr = cv::imread(target_image);
+    face_detect->detect(target_img_bgr, target_detected_boxes);
+    auto target_test_bounding_box = target_detected_boxes[0];
+    std::vector<cv::Point2f> target_face_landmark_5of68;
+    face_landmarks->detect(target_img_bgr, target_test_bounding_box,target_face_landmark_5of68);
+
+    cv::Mat face_swap_image;
+    face_swap->detect(target_img_bgr,source_image_embeding,target_face_landmark_5of68,face_swap_image);
+    face_restoration->detect(face_swap_image,target_face_landmark_5of68,save_image_path);
+}
\ No newline at end of file
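The pipeline indexes `detected_boxes[0]` unconditionally. A hedged robustness sketch (illustrative only, not part of the patch) that would avoid an out-of-range access on images with no detected face:

```cpp
// Bail out early when detection finds nothing.
if (detected_boxes.empty()) {
    std::cerr << "FaceFusion pipeline: no face detected in the source image." << std::endl;
    return;
}
auto test_bounding_box = detected_boxes[0]; // safe now
```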
diff --git a/lite/ort/cv/face_fusion_pipeline.h b/lite/ort/cv/face_fusion_pipeline.h
new file mode 100644
index 00000000..bbf31395
--- /dev/null
+++ b/lite/ort/cv/face_fusion_pipeline.h
@@ -0,0 +1,38 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_FUSION_PIPELINE_H
+#define LITE_AI_TOOLKIT_FACE_FUSION_PIPELINE_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/cv/face_restoration.h"
+#include "lite/ort/cv/face_swap.h"
+#include "lite/ort/cv/face_recognizer.h"
+#include "lite/ort/cv/yolofacev8.h"
+#include "lite/ort/cv/face_68landmarks.h"
+
+namespace ortcv{
+    class Face_Fusion_Pipeline{
+    public:
+        Face_Fusion_Pipeline(
+                const std::string &face_detect_onnx_path,
+                const std::string &face_landmarks_68_onnx_path,
+                const std::string &face_recognizer_onnx_path,
+                const std::string &face_swap_onnx_path,
+                const std::string &face_restoration_onnx_path
+        );
+        ~Face_Fusion_Pipeline() = default; // members are managed by smart pointers
+
+    private:
+        std::unique_ptr<Face_Restoration> face_restoration;
+        std::unique_ptr<YoloFaceV8> face_detect;
+        std::unique_ptr<Face_68Landmarks> face_landmarks;
+        std::unique_ptr<Face_Recognizer> face_recognizer;
+        std::unique_ptr<Face_Swap> face_swap;
+
+    public:
+        void detect(const std::string &source_image,const std::string &target_image,const std::string &save_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_FUSION_PIPELINE_H
diff --git a/lite/ort/cv/face_recognizer.cpp b/lite/ort/cv/face_recognizer.cpp
new file mode 100644
index 00000000..8082d800
--- /dev/null
+++ b/lite/ort/cv/face_recognizer.cpp
@@ -0,0 +1,96 @@
+//
+// Created by wangzijian on 11/4/24.
+//
+
+#include "face_recognizer.h"
+using ortcv::Face_Recognizer;
+
+cv::Mat Face_Recognizer::preprocess(cv::Mat &input_mat, std::vector<cv::Point2f> &face_landmark_5,cv::Mat &preprocessed_mat) {
+    cv::Mat crop_image;
+    cv::Mat affine_martix;
+
+    std::tie(crop_image,affine_martix) = face_utils::warp_face_by_face_landmark_5(input_mat,face_landmark_5,face_utils::ARCFACE_112_V2);
+    crop_image.convertTo(crop_image,CV_32FC3, 1.0f / 127.5f,-1.0);
+    cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB);
+
+    return crop_image;
+}
+
+Ort::Value Face_Recognizer::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_Recognizer::detect(cv::Mat &input_mat, std::vector<cv::Point2f> &face_landmark_5) {
+    cv::Mat ori_image = input_mat.clone();
+
+    cv::Mat crop_image = preprocess(input_mat,face_landmark_5,ori_image);
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    float *pdata = output_tensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    std::vector<float> output(pdata, pdata + 512);
+
+    // L2-normalize the 512-d embedding
+    float norm = 0.0f;
+    for (const auto &val : output) {
+        norm += val * val;
+    }
+    norm = std::sqrt(norm);
+
+    for (auto &val : output) {
+        val /= norm;
+    }
+
+    std::cout << "done!" << std::endl;
+}
+
+void Face_Recognizer::detect(cv::Mat &input_mat, std::vector<cv::Point2f> &face_landmark_5, std::vector<float> &embeding) {
+    cv::Mat ori_image = input_mat.clone();
+
+    cv::Mat crop_image = preprocess(input_mat,face_landmark_5,ori_image);
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    float *pdata = output_tensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    embeding.assign(pdata,pdata + 512);
+    std::vector<float> normal_embeding(pdata,pdata + 512);
+
+    // L2-normalize a copy of the embedding
+    float norm = 0.0f;
+    for (const auto &val : normal_embeding) {
+        norm += val * val;
+    }
+    norm = std::sqrt(norm);
+
+    for (auto &val : normal_embeding) {
+        val /= norm;
+    }
+
+    std::cout << "done!" << std::endl;
+}
diff --git a/lite/ort/cv/face_recognizer.h b/lite/ort/cv/face_recognizer.h
new file mode 100644
--- /dev/null
+++ b/lite/ort/cv/face_recognizer.h
+#ifndef LITE_AI_TOOLKIT_FACE_RECOGNIZER_H
+#define LITE_AI_TOOLKIT_FACE_RECOGNIZER_H
+
+namespace ortcv{
+    class LITE_EXPORTS Face_Recognizer : public BasicOrtHandler{
+    private:
+        cv::Mat preprocess(cv::Mat &input_mat,std::vector<cv::Point2f> &face_landmark_5,cv::Mat &preprocessed_mat);
+
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+    public:
+        void detect(cv::Mat &input_mat,std::vector<cv::Point2f> &face_landmark_5);
+
+        void detect(cv::Mat &input_mat,std::vector<cv::Point2f> &face_landmark_5,std::vector<float> &embeding);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_RECOGNIZER_H
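Why the embedding is L2-normalized: identity comparison then reduces to a plain dot product (cosine similarity). A hedged sketch of that comparison step (the helper name is hypothetical; it is not part of this patch):

```cpp
#include <numeric>
#include <vector>

// Cosine similarity of two already-L2-normalized 512-d embeddings.
static float cosine_similarity(const std::vector<float> &a, const std::vector<float> &b) {
    return std::inner_product(a.begin(), a.end(), b.begin(), 0.0f);
}
// A value near 1.0 suggests the same identity; near 0.0, a different one.
```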
diff --git a/lite/ort/cv/face_restoration.cpp b/lite/ort/cv/face_restoration.cpp
new file mode 100644
index 00000000..77df1bf2
--- /dev/null
+++ b/lite/ort/cv/face_restoration.cpp
@@ -0,0 +1,95 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#include "face_restoration.h"
+
+using ortcv::Face_Restoration;
+
+Ort::Value Face_Restoration::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_Restoration::detect(cv::Mat &face_swap_image, std::vector<cv::Point2f> &target_landmarks_5 , const std::string &face_enchaner_path) {
+    auto ori_image = face_swap_image.clone();
+
+    cv::Mat crop_image;
+    cv::Mat affine_matrix;
+    std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,face_utils::FFHQ_512);
+
+    std::vector<float> crop_size = {512,512};
+    cv::Mat box_mask = face_utils::create_static_box_mask(crop_size);
+    std::vector<cv::Mat> crop_mask_list;
+    crop_mask_list.emplace_back(box_mask);
+
+    cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB);
+    crop_image.convertTo(crop_image,CV_32FC3,1.f / 255.f);
+    crop_image.convertTo(crop_image,CV_32FC3,2.0f,-1.f); // scale to [-1, 1]
+
+    Ort::Value input_tensor = transform(crop_image);
+    Ort::RunOptions runOptions;
+
+    // 2. infer
+    auto output_tensors = ort_session->Run(
+            runOptions, input_node_names.data(),
+            &input_tensor, 1, output_node_names.data(), num_outputs
+    );
+
+    float *pdata = output_tensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    int channel = 3;
+    int height = 512;
+    int width = 512;
+    std::vector<float> output(channel * height * width);
+    output.assign(pdata,pdata + (channel * height * width));
+
+    // clamp to [-1, 1], then map to [0, 1]
+    std::transform(output.begin(),output.end(),output.begin(),
+                   [](float x){return std::max(-1.0f,std::min(1.0f,x));});
+
+    std::transform(output.begin(),output.end(),output.begin(),
+                   [](float x){return (x + 1.f) / 2.f;});
+
+    // CHW -> HWC
+    std::vector<float> transposed_data(channel * height * width);
+    for (int c = 0; c < channel; ++c){
+        for (int h = 0 ; h < height; ++h){
+            for (int w = 0; w < width ; ++w){
+                int src_index = c * (height * width) + h * width + w;
+                int dst_index = h * (width * channel) + w * channel + c;
+                transposed_data[dst_index] = output[src_index];
+            }
+        }
+    }
+
+    std::transform(transposed_data.begin(),transposed_data.end(),transposed_data.begin(),
+                   [](float x){return std::round(x * 255.f);});
+
+    std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(),
+                   [](float x) { return static_cast<int>(x); });
+
+    cv::Mat mat(height, width, CV_32FC3, transposed_data.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    auto crop_mask = crop_mask_list[0];
+    cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);
+
+    cv::Mat dst_image = face_utils::blend_frame(ori_image,paste_frame);
+
+    cv::imwrite(face_enchaner_path,dst_image);
+}
\ No newline at end of file
diff --git a/lite/ort/cv/face_restoration.h b/lite/ort/cv/face_restoration.h
new file mode 100644
index 00000000..e4febb85
--- /dev/null
+++ b/lite/ort/cv/face_restoration.h
@@ -0,0 +1,28 @@
+//
+// Created by wangzijian on 11/7/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_RESTORATION_H
+#define LITE_AI_TOOLKIT_FACE_RESTORATION_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/core/ort_types.h"
+#include "lite/ort/core/ort_utils.h"
+#include "lite/ort/cv/face_utils.h"
+
+namespace ortcv{
+    class LITE_EXPORTS Face_Restoration : public BasicOrtHandler{
+    public:
+        explicit Face_Restoration(const std::string &_onnx_path, unsigned int _num_threads = 1):
+                BasicOrtHandler(_onnx_path,_num_threads){};
+        ~Face_Restoration() override = default;
+
+    private:
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+    public:
+        void detect(cv::Mat &face_swap_image,std::vector<cv::Point2f> &target_landmarks_5 ,const std::string &face_enchaner_path);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_RESTORATION_H
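The hand-rolled CHW→HWC transpose in `Face_Restoration::detect` above can also be expressed with OpenCV planes. A hedged, behavior-equivalent sketch (the helper name is hypothetical):

```cpp
#include <opencv2/opencv.hpp>
#include <vector>

// Wrap each CHW plane of the network output (pdata = C*H*W floats) without copying,
// then let cv::merge interleave them into an HWC CV_32FC3 image.
cv::Mat chw_to_hwc(float *pdata, int height, int width) {
    std::vector<cv::Mat> planes = {
            cv::Mat(height, width, CV_32F, pdata + 0 * height * width),
            cv::Mat(height, width, CV_32F, pdata + 1 * height * width),
            cv::Mat(height, width, CV_32F, pdata + 2 * height * width)};
    cv::Mat hwc;
    cv::merge(planes, hwc); // CV_32FC3; channel order unchanged (RGB here)
    return hwc.clone();     // clone because the planes alias pdata
}
```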
diff --git a/lite/ort/cv/face_swap.cpp b/lite/ort/cv/face_swap.cpp
new file mode 100644
index 00000000..36ec87ac
--- /dev/null
+++ b/lite/ort/cv/face_swap.cpp
@@ -0,0 +1,120 @@
+//
+// Created by wangzijian on 11/5/24.
+//
+
+#include "face_swap.h"
+using ortcv::Face_Swap;
+
+void Face_Swap::preprocess(cv::Mat &target_face, std::vector<float> source_image_embeding,
+                           std::vector<cv::Point2f> target_landmark_5,std::vector<float> &processed_source_embeding,
+                           cv::Mat &preprocessed_mat) {
+
+    std::tie(preprocessed_mat, affine_martix) = face_utils::warp_face_by_face_landmark_5(target_face,target_landmark_5,face_utils::ARCFACE_128_V2);
+
+    std::vector<float> crop_size= {128.0,128.0};
+    crop_list.emplace_back(face_utils::create_static_box_mask(crop_size));
+
+    cv::cvtColor(preprocessed_mat,preprocessed_mat,cv::COLOR_BGR2RGB);
+    preprocessed_mat.convertTo(preprocessed_mat,CV_32FC3,1.0 / 255.f);
+    preprocessed_mat.convertTo(preprocessed_mat,CV_32FC3,1.0 / 1.f,0);
+
+    std::vector<float> model_martix = face_utils::load_npy("/home/facefusion-onnxrun/python/model_matrix.npy");
+
+    processed_source_embeding = face_utils::dot_product(source_image_embeding,model_martix,512);
+
+    face_utils::normalize(processed_source_embeding);
+
+    std::cout << "done!" << std::endl;
+}
+
+Ort::Value Face_Swap::transform(const cv::Mat &mat_rs) {
+    input_node_dims[0] = 1;
+    input_node_dims[1] = mat_rs.channels();
+    input_node_dims[2] = mat_rs.rows;
+    input_node_dims[3] = mat_rs.cols;
+
+    return ortcv::utils::transform::create_tensor(
+            mat_rs, input_node_dims, memory_info_handler,
+            input_values_handler, ortcv::utils::transform::CHW);
+}
+
+void Face_Swap::detect(cv::Mat &target_image, std::vector<float> source_face_embeding,std::vector<cv::Point2f> target_landmark_5,
+                       cv::Mat &face_swap_image){
+
+    cv::Mat ori_image = target_image.clone();
+    std::vector<float> source_embeding_input;
+    cv::Mat model_input_mat;
+    preprocess(target_image,source_face_embeding,target_landmark_5,source_embeding_input,model_input_mat);
+    Ort::Value inputTensor_target = transform(model_input_mat);
+
+    std::vector<int64_t> input_node_dims = {1, 512};
+    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+    Ort::Value inputTensor_embeding = Ort::Value::CreateTensor<float>(
+            memory_info,
+            source_embeding_input.data(),
+            source_embeding_input.size(),
+            input_node_dims.data(),
+            input_node_dims.size()
+    );
+
+    std::vector<Ort::Value> inputTensors;
+    inputTensors.push_back(std::move(inputTensor_target));
+    inputTensors.push_back(std::move(inputTensor_embeding));
+
+    Ort::RunOptions runOptions;
+
+    std::vector<const char*> input_node_names_face_swap = {
+            "target",
+            "source",
+    };
+
+    std::vector<const char*> output_node_names_face_swap = {
+            "output"
+    };
+
+    std::vector<Ort::Value> outputTensors = ort_session->Run(
+            runOptions,
+            input_node_names_face_swap.data(),
+            inputTensors.data(),
+            inputTensors.size(),
+            output_node_names_face_swap.data(),
+            output_node_names_face_swap.size()
+    );
+
+    float *p_data = outputTensors[0].GetTensorMutableData<float>();
+    std::vector<int64_t> out_shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+    std::vector<float> output_swap_image(1 * 3 * 128 * 128);
+    output_swap_image.assign(p_data,p_data + (1 * 3 * 128 * 128));
+
+    std::vector<float> transposed(3 * 128 * 128);
+    int channels = 3;
+    int height = 128;
+    int width = 128;
+
+    for (int c = 0; c < channels; ++c) {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+                int src_idx = c * (height * width) + h * width + w;      // CHW
+                int dst_idx = h * (width * channels) + w * channels + c; // HWC
+                transposed[dst_idx] = output_swap_image[src_idx];
+            }
+        }
+    }
+
+    for (auto& val : transposed) {
+        val = std::round(val * 255.0);
+    }
+
+    cv::Mat mat(height, width, CV_32FC3, transposed.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    cv::Mat dst_image = face_utils::paste_back(ori_image,mat,crop_list[0],affine_martix);
+    face_swap_image = dst_image;
+}
\ No newline at end of file
diff --git a/lite/ort/cv/face_swap.h b/lite/ort/cv/face_swap.h
new file mode 100644
index 00000000..9346aef2
--- /dev/null
+++ b/lite/ort/cv/face_swap.h
@@ -0,0 +1,38 @@
+//
+// Created by wangzijian on 11/5/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_SWAP_H
+#define LITE_AI_TOOLKIT_FACE_SWAP_H
+#include "lite/ort/core/ort_core.h"
+#include "lite/ort/core/ort_types.h"
+#include "lite/ort/core/ort_utils.h"
+#include "lite/ort/cv/face_restoration.h"
+#include "lite/ort/cv/face_utils.h"
+
+namespace ortcv{
+    class LITE_EXPORTS Face_Swap : public BasicOrtHandler
+    {
+    public:
+        explicit Face_Swap(const std::string &_onnx_path, unsigned int _num_threads = 1):
+                BasicOrtHandler(_onnx_path, _num_threads){};
+        ~Face_Swap() override = default;
+    private:
+        void preprocess(cv::Mat &target_face,std::vector<float> source_image_embeding,std::vector<cv::Point2f> target_landmark_5,
+                        std::vector<float> &processed_source_embeding,cv::Mat &preprocessed_mat);
+
+        Ort::Value transform(const cv::Mat &mat_rs) override;
+
+    private:
+        std::vector<cv::Mat> crop_list;
+
+        cv::Mat affine_martix;
+
+    public:
+        void detect(cv::Mat &target_image,std::vector<float> source_face_embeding,std::vector<cv::Point2f> target_landmark_5, cv::Mat &face_swap_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_SWAP_H
diff --git a/lite/ort/cv/face_utils.cpp b/lite/ort/cv/face_utils.cpp
new file mode 100644
index 00000000..56df2ee6
--- /dev/null
+++ b/lite/ort/cv/face_utils.cpp
@@ -0,0 +1,335 @@
+//
+// Created by wangzijian on 11/11/24.
+//
+
+#include "face_utils.h"
+
+cv::Mat
+face_utils::paste_back(const cv::Mat &temp_vision_frame, const cv::Mat &crop_vision_frame, const cv::Mat &crop_mask,
+                       const cv::Mat &affine_matrix) {
+
+    // Make sure every image is float
+    cv::Mat temp_float, crop_float, mask_float;
+    temp_vision_frame.convertTo(temp_float, CV_32F);
+    crop_vision_frame.convertTo(crop_float, CV_32F);
+    crop_mask.convertTo(mask_float, CV_32F);
+
+    // Invert the affine transform
+    cv::Mat inverse_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_matrix);
+
+    // Target size
+    cv::Size temp_size(temp_vision_frame.cols, temp_vision_frame.rows);
+
+    // Warp the mask back with the inverse affine transform
+    cv::Mat inverse_mask;
+    cv::warpAffine(mask_float, inverse_mask, inverse_matrix, temp_size);
+    cv::threshold(inverse_mask, inverse_mask, 1.0, 1.0, cv::THRESH_TRUNC);  // clip at 1
+    cv::threshold(inverse_mask, inverse_mask, 0.0, 0.0, cv::THRESH_TOZERO); // clip at 0
+
+    // Warp the cropped face back with the inverse affine transform
+    cv::Mat inverse_vision_frame;
+    cv::warpAffine(crop_float, inverse_vision_frame, inverse_matrix,
+                   temp_size, cv::INTER_LINEAR, cv::BORDER_REPLICATE);
+
+    // Create the output image
+    cv::Mat paste_vision_frame;
+    temp_float.copyTo(paste_vision_frame);
+
+    // Blend channel by channel
+    std::vector<cv::Mat> channels(3);
+    std::vector<cv::Mat> inverse_channels(3);
+    std::vector<cv::Mat> temp_channels(3);
+
+    cv::split(inverse_vision_frame, inverse_channels);
+    cv::split(temp_float, temp_channels);
+
+    // Build 1 - mask
+    cv::Mat inverse_weight;
+    cv::subtract(cv::Scalar(1.0), inverse_mask, inverse_weight);
+
+    for (int i = 0; i < 3; ++i) {
+        // Keep all arithmetic in the same type (CV_32F)
+        cv::Mat weighted_inverse, weighted_temp;
+        cv::multiply(inverse_mask, inverse_channels[i], weighted_inverse);
+        cv::multiply(inverse_weight, temp_channels[i], weighted_temp);
+        cv::add(weighted_inverse, weighted_temp, channels[i]);
+    }
+
+    cv::merge(channels, paste_vision_frame);
+
+    // Convert back to the original type if necessary
+    cv::Mat result;
+    if(temp_vision_frame.type() != CV_32F) {
+        paste_vision_frame.convertTo(result, temp_vision_frame.type());
+    } else {
+        result = paste_vision_frame;
+    }
+
+    return result;
+}
+
+namespace face_utils
+{
+    const std::vector<cv::Point2f> face_template_128 = {
+            cv::Point2f(0.36167656, 0.40387734),
+            cv::Point2f(0.63696719, 0.40235469),
+            cv::Point2f(0.50019687, 0.56044219),
+            cv::Point2f(0.38710391, 0.72160547),
+            cv::Point2f(0.61507734, 0.72034453)
+    };
+
+    const std::vector<cv::Point2f> face_template_112 = {
+            cv::Point2f(0.34191607, 0.46157411),
+            cv::Point2f(0.65653393, 0.45983393),
+            cv::Point2f(0.50022500, 0.64050536),
+            cv::Point2f(0.37097589, 0.82469196),
+            cv::Point2f(0.63151696, 0.82325089)
+    };
+
+    const std::vector<cv::Point2f> face_template_512 = {
+            cv::Point2f(0.37691676, 0.46864664),
+            cv::Point2f(0.62285697, 0.46912813),
+            cv::Point2f(0.50123859, 0.61331904),
+            cv::Point2f(0.39308822, 0.72541100),
+            cv::Point2f(0.61150205, 0.72490465)
+    };
+
+    const std::vector<std::vector<cv::Point2f>> face_template_vector = {face_template_112, face_template_128, face_template_512};
+}
+
+std::pair<cv::Mat, cv::Mat>
+face_utils::warp_face_by_face_landmark_5(cv::Mat input_mat, std::vector<cv::Point2f> face_landmark_5,
+                                         unsigned int type) {
+
+    std::vector<cv::Point2f> current_template_select;
+    if (type == face_utils::ARCFACE_112_V2)
+    {
+        current_template_select = face_utils::face_template_vector[0];
+    }
+
+    if (type == face_utils::ARCFACE_128_V2)
+    {
+        current_template_select = face_utils::face_template_vector[1];
+    }
+
+    if (type == face_utils::FFHQ_512)
+    {
+        current_template_select = face_utils::face_template_vector[2];
+    }
+
+    // Build the normalized template points
+    std::vector<cv::Point2f> normed_template;
+    for(auto current_template : current_template_select) // face_template could also live as a class member
+    {
+        current_template.x = current_template.x * type; // the FaceType value doubles as the crop side length (e.g. 512)
+        current_template.y = current_template.y * type; // note: the original code scaled y by x; fixed to use y here
+        normed_template.emplace_back(current_template);
+    }
+
+    // Estimate the affine transform
+    cv::Mat inliers;
+    cv::Mat affine_matrix = cv::estimateAffinePartial2D(
+            face_landmark_5,
+            normed_template,
+            inliers,
+            cv::RANSAC,
+            100
+    );
+
+    // Check that the transform is valid
+    if (affine_matrix.empty()) {
+        throw std::runtime_error("Failed to estimate affine transformation");
+    }
+
+    // Apply the affine warp
+    cv::Mat crop_img;
+    cv::warpAffine(
+            input_mat,
+            crop_img,
+            affine_matrix,
+            cv::Size(type, type),
+            cv::INTER_AREA,
+            cv::BORDER_REPLICATE
+    );
+
+    return std::make_pair(crop_img, affine_matrix);
+}
+
+std::vector<float>
+face_utils::dot_product(const std::vector<float> &vec, const std::vector<float> &matrix, int matrix_cols) {
+    std::vector<float> result(matrix_cols);
+    int vec_size = vec.size();
+
+    for (int j = 0; j < matrix_cols; ++j) {
+        float sum = 0.0f;
+        for (int i = 0; i < vec_size; ++i) {
+            sum += vec[i] * matrix[i * matrix_cols + j];
+        }
+        result[j] = sum;
+    }
+    return result;
+}
+
+float face_utils::calculate_norm(const std::vector<float> &vec) {
+    float sum = 0.0f;
+    for (float v : vec) {
+        sum += v * v;
+    }
+    return std::sqrt(sum);
+}
+
+void face_utils::normalize(std::vector<float> &vec) {
+    float norm = calculate_norm(vec);
+    if (norm > 0) {
+        for (float& v : vec) {
+            v /= norm;
+        }
+    }
+}
+
+std::vector<float> face_utils::load_npy(const std::string &filename) {
+    std::ifstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        throw std::runtime_error("Cannot open file: " + filename);
+    }
+
+    char magic[6];
+    file.read(magic, 6);
+    if (magic[0] != '\x93' || magic[1] != 'N' || magic[2] != 'U' ||
+        magic[3] != 'M' || magic[4] != 'P' || magic[5] != 'Y') {
+        throw std::runtime_error("Invalid .npy file format");
+    }
+
+    uint8_t major_version, minor_version;
+    file.read(reinterpret_cast<char*>(&major_version), 1);
+    file.read(reinterpret_cast<char*>(&minor_version), 1);
+
+    uint16_t header_len;
+    file.read(reinterpret_cast<char*>(&header_len), 2);
+
+    std::vector<char> header(header_len);
+    file.read(header.data(), header_len);
+
+    // the header is skipped: the shape is assumed to be 512x512 float32
+    size_t num_elements = 512 * 512;
+
+    // Read the payload
+    std::vector<float> data(num_elements);
+    file.read(reinterpret_cast<char*>(data.data()), num_elements * sizeof(float));
+
+    return data;
+}
+
+std::pair<cv::Mat, cv::Mat>
+face_utils::warp_face_by_translation(const cv::Mat &temp_img, cv::Point2f &translation, float scale,
+                                     const cv::Size &crop_size) {
+    cv::Mat affine_matrix = (cv::Mat_<float>(2, 3) << scale, 0, translation.x,
+            0, scale, translation.y);
+
+    cv::Mat crop_img;
+    cv::warpAffine(temp_img, crop_img, affine_matrix, crop_size);
+
+    return {crop_img, affine_matrix};
+}
+
+std::vector<cv::Point2f> face_utils::convert_face_landmark_68_to_5(const std::vector<cv::Point2f> &landmark_68) {
+    std::vector<cv::Point2f> face_landmark_5;
+
+    // Center of the left eye
+    cv::Point2f left_eye(0.0f, 0.0f);
+    for (int i = 36; i < 42; ++i) {
+        left_eye += landmark_68[i];
+    }
+    left_eye *= (1.0f / 6.0f); // average
+
+    // Center of the right eye
+    cv::Point2f right_eye(0.0f, 0.0f);
+    for (int i = 42; i < 48; ++i) {
+        right_eye += landmark_68[i];
+    }
+    right_eye *= (1.0f / 6.0f); // average
+
+    // Nose tip
+    cv::Point2f nose = landmark_68[30];
+
+    // Left and right mouth corners
+    cv::Point2f left_mouth_end = landmark_68[48];
+    cv::Point2f right_mouth_end = landmark_68[54];
+
+    // Collect the 5 points
+    face_landmark_5.push_back(left_eye);
+    face_landmark_5.push_back(right_eye);
+    face_landmark_5.push_back(nose);
+    face_landmark_5.push_back(left_mouth_end);
+    face_landmark_5.push_back(right_mouth_end);
+
+    return face_landmark_5;
+}
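A hedged usage sketch of the alignment helper above (the frame/landmark variables are placeholders), showing that the FaceType value both selects the template and sets the crop size:

```cpp
#include "lite/ort/cv/face_utils.h"

// Align a face crop for ArcFace-112; M maps frame -> crop coordinates.
cv::Mat align_for_arcface(const cv::Mat &frame, const std::vector<cv::Point2f> &landmarks5) {
    cv::Mat aligned, M;
    std::tie(aligned, M) = face_utils::warp_face_by_face_landmark_5(frame, landmarks5, face_utils::ARCFACE_112_V2);
    return aligned; // 112x112; use cv::invertAffineTransform(M, ...) to map results back
}
```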
+cv::Mat face_utils::blend_frame(const cv::Mat &target_image, const cv::Mat &paste_frame) {
+    // blend 80% toward the restored frame
+    float face_enhancer_blend = 1.0f - (80.0f / 100.0f);
+
+    cv::Mat temp_vision_frame;
+
+    cv::addWeighted(target_image, face_enhancer_blend,
+                    paste_frame, 1.0f - face_enhancer_blend,
+                    0,
+                    temp_vision_frame);
+
+    return temp_vision_frame;
+}
+
+cv::Mat face_utils::create_static_box_mask(std::vector<float> crop_size) {
+
+    float face_mask_blur = 0.3;
+
+    std::vector<float> face_mask_padding = {0,0,0,0};
+
+    // Calculate blur parameters
+    int blur_amount = static_cast<int>(crop_size[0] * 0.5 * face_mask_blur);
+    int blur_area = std::max(blur_amount / 2, 1);
+
+    // Create initial mask filled with ones
+    cv::Mat box_mask = cv::Mat::ones(crop_size[1], crop_size[0], CV_32F);
+
+    // Calculate padding areas
+    int top_padding = std::max(blur_area, static_cast<int>(crop_size[1] * face_mask_padding[0] / 100.0));
+    int bottom_padding = std::max(blur_area, static_cast<int>(crop_size[1] * face_mask_padding[2] / 100.0));
+    int right_padding = std::max(blur_area, static_cast<int>(crop_size[0] * face_mask_padding[1] / 100.0));
+    int left_padding = std::max(blur_area, static_cast<int>(crop_size[0] * face_mask_padding[3] / 100.0));
+
+    // Set padding regions to zero
+    // Top region
+    if (top_padding > 0) {
+        box_mask(cv::Rect(0, 0, crop_size[0], top_padding)) = 0.0;
+    }
+
+    // Bottom region
+    if (bottom_padding > 0) {
+        box_mask(cv::Rect(0, crop_size[1] - bottom_padding, crop_size[0], bottom_padding)) = 0.0;
+    }
+
+    // Left region
+    if (left_padding > 0) {
+        box_mask(cv::Rect(0, 0, left_padding, crop_size[1])) = 0.0;
+    }
+
+    // Right region
+    if (right_padding > 0) {
+        box_mask(cv::Rect(crop_size[0] - right_padding, 0, right_padding, crop_size[1])) = 0.0;
+    }
+
+    // Apply Gaussian blur if needed
+    if (blur_amount > 0) {
+        cv::GaussianBlur(box_mask, box_mask, cv::Size(0, 0), blur_amount * 0.25);
+    }
+
+    return box_mask;
+}
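For reference, the per-channel loop in `paste_back` above computes `paste = mask * warped_crop + (1 - mask) * original`. A hedged sketch of the same formula with a 3-channel mask and Mat expressions (helper name hypothetical; all inputs assumed CV_32F):

```cpp
#include <opencv2/opencv.hpp>

cv::Mat blend_with_mask(const cv::Mat &original, const cv::Mat &warped_crop, const cv::Mat &mask) {
    cv::Mat mask3;
    cv::merge(std::vector<cv::Mat>{mask, mask, mask}, mask3); // single-channel mask -> CV_32FC3
    return mask3.mul(warped_crop) + (cv::Scalar::all(1.0) - mask3).mul(original);
}
```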
diff --git a/lite/ort/cv/face_utils.h b/lite/ort/cv/face_utils.h
new file mode 100644
index 00000000..08f5b24b
--- /dev/null
+++ b/lite/ort/cv/face_utils.h
@@ -0,0 +1,60 @@
+//
+// Created by wangzijian on 11/11/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_UTILS_H
+#define LITE_AI_TOOLKIT_FACE_UTILS_H
+#include "opencv2/opencv.hpp"
+#include <fstream>
+#pragma once
+
+namespace face_utils
+{
+    cv::Mat paste_back(const cv::Mat& temp_vision_frame,
+                       const cv::Mat& crop_vision_frame,
+                       const cv::Mat& crop_mask,
+                       const cv::Mat& affine_matrix);
+
+    std::pair<cv::Mat, cv::Mat> warp_face_by_translation(const cv::Mat& temp_img,cv::Point2f& translation,
+                                                         float scale, const cv::Size& crop_size);
+
+    std::vector<float> dot_product(const std::vector<float>& vec,
+                                   const std::vector<float>& matrix,
+                                   int matrix_cols);
+
+    std::pair<cv::Mat, cv::Mat> warp_face_by_face_landmark_5(cv::Mat input_mat,
+                                                             std::vector<cv::Point2f> face_landmark_5,unsigned int type);
+
+    std::vector<cv::Point2f> convert_face_landmark_68_to_5(const std::vector<cv::Point2f>& landmark_68);
+
+    cv::Mat blend_frame(const cv::Mat &target_image, const cv::Mat &paste_frame);
+
+    cv::Mat create_static_box_mask(std::vector<float> crop_size);
+
+    void normalize(std::vector<float>& vec);
+
+    float calculate_norm(const std::vector<float>& vec);
+
+    std::vector<float> load_npy(const std::string& filename);
+
+    // TODO: the following three template vectors should be merged into one structure
+    extern const std::vector<std::vector<cv::Point2f>> face_template_vector;
+
+    extern const std::vector<cv::Point2f> face_template_128;
+
+    extern const std::vector<cv::Point2f> face_template_112;
+
+    extern const std::vector<cv::Point2f> face_template_512;
+
+    enum FaceType {
+        ARCFACE_112_V2 = 112,
+        ARCFACE_128_V2 = 128,
+        FFHQ_512 = 512
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_FACE_UTILS_H
diff --git a/lite/ort/cv/yolofacev8.h b/lite/ort/cv/yolofacev8.h
index 10e98d62..fe455db1 100644
--- a/lite/ort/cv/yolofacev8.h
+++ b/lite/ort/cv/yolofacev8.h
@@ -20,8 +20,6 @@ namespace ortcv {
     private:
         float mean = -127.5 / 128.0;
        float scale = 1 / 128.0;
-        // const float conf_threshold = 0.5f;
-        // const float iou_threshold = 0.4f;

        float ratio_width;
        float ratio_height;
diff --git a/lite/trt/cv/trt_face_68landmarks.cpp b/lite/trt/cv/trt_face_68landmarks.cpp
new file mode 100644
index 00000000..b8165ca9
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks.cpp
@@ -0,0 +1,90 @@
+//
+// Created by wangzijian on 11/12/24.
+//
+
+#include "trt_face_68landmarks.h"
+using trtcv::TRTFaceFusionFace68Landmarks;
+
+void
+TRTFaceFusionFace68Landmarks::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img) {
+    float xmin = bounding_box.x1;
+    float ymin = bounding_box.y1;
+    float xmax = bounding_box.x2;
+    float ymax = bounding_box.y2;
+
+    float width = xmax - xmin;
+    float height = ymax - ymin;
+    float max_side = std::max(width, height);
+    float scale = 195.0f / max_side;
+
+    float center_x = (xmax + xmin) * scale;
+    float center_y = (ymax + ymin) * scale;
+
+    cv::Point2f translation;
+    translation.x = (256.0f - center_x) * 0.5f;
+    translation.y = (256.0f - center_y) * 0.5f;
+
+    cv::Size crop_size(256, 256);
+
+    std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
+
+    crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
+}
+
+void TRTFaceFusionFace68Landmarks::detect(const cv::Mat &input_mat, const lite::types::BoundingBoxType<float, float> &bbox,
+                                          std::vector<cv::Point2f> &face_landmark_5of68) {
+    if (input_mat.empty()) return;
+
+    img_with_landmarks = input_mat.clone();
+    cv::Mat crop_image;
+
+    preprocess(bbox,input_mat,crop_image);
+
+    std::vector<float> input_data;
+
+    trtcv::utils::transform::create_tensor(crop_image,input_data,input_node_dims,trtcv::utils::transform::CHW);
+
+    cudaMemcpyAsync(buffers[0], input_data.data(), input_node_dims[0] * input_node_dims[1] * input_node_dims[2] * input_node_dims[3] * sizeof(float),
+                    cudaMemcpyHostToDevice, stream);
+
+    // Synchronize the stream before inference so the copy has fully completed
+    cudaStreamSynchronize(stream);
+    bool status = trt_context->enqueueV3(stream);
+    cudaStreamSynchronize(stream);
+
+    if (!status){
+        std::cerr << "Failed to infer by TensorRT." << std::endl;
+        return;
+    }
+
+    std::vector<float> output(output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2]);
+    cudaMemcpyAsync(output.data(), buffers[1], output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2] * sizeof(float),
+                    cudaMemcpyDeviceToHost, stream);
+    cudaStreamSynchronize(stream);
+
+    postprocess(output.data(),face_landmark_5of68);
+}
+
+void TRTFaceFusionFace68Landmarks::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68) {
+    std::vector<cv::Point2f> landmarks;
+
+    for (int i = 0;i < 68; ++i)
+    {
+        float x = trt_outputs[i * 3] / 64.0f * 256.f;
+        float y = trt_outputs[i * 3 + 1] / 64.0f * 256.f;
+        landmarks.emplace_back(x, y);
+    }
+
+    cv::Mat inverse_affine_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+
+    cv::transform(landmarks, landmarks, inverse_affine_matrix);
+
+    face_landmark_5of68 = face_utils::convert_face_landmark_68_to_5(landmarks);
+}
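One note on the copy/sync pattern above: work issued to a single CUDA stream executes in issue order, so the copy→infer→copy chain strictly needs only one synchronization at the end. A hedged minimal sketch (buffer names hypothetical):

```cpp
// All three operations are enqueued on the same stream, so they run in order:
cudaMemcpyAsync(d_input, h_input, in_bytes, cudaMemcpyHostToDevice, stream);     // H2D
trt_context->enqueueV3(stream);                                                  // inference
cudaMemcpyAsync(h_output, d_output, out_bytes, cudaMemcpyDeviceToHost, stream);  // D2H
cudaStreamSynchronize(stream); // a single sync after the last async op suffices
```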
diff --git a/lite/trt/cv/trt_face_68landmarks.h b/lite/trt/cv/trt_face_68landmarks.h
new file mode 100644
index 00000000..5ead05a8
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks.h
@@ -0,0 +1,37 @@
+//
+// Created by wangzijian on 11/12/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_68LANDMARKS_H
+#define LITE_AI_TOOLKIT_TRT_FACE_68LANDMARKS_H
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/core/trt_utils.h"
+#include "lite/trt/core/trt_types.h"
+#include "algorithm"
+
+namespace trtcv{
+    class LITE_EXPORTS TRTFaceFusionFace68Landmarks : public BasicTRTHandler{
+    public:
+        explicit TRTFaceFusionFace68Landmarks(const std::string& _trt_model_path,unsigned int _num_threads = 1):
+                BasicTRTHandler(_trt_model_path,_num_threads){};
+    private:
+        cv::Mat affine_matrix;
+        cv::Mat img_with_landmarks;
+    private:
+        void preprocess(const lite::types::Boxf &bounding_box,const cv::Mat &input_mat,cv::Mat &crop_img);
+
+        void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
+
+    public:
+        void detect(const cv::Mat &input_mat,const lite::types::BoundingBoxType<float, float> &bbox, std::vector<cv::Point2f> &face_landmark_5of68);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACE_68LANDMARKS_H
diff --git a/lite/trt/cv/trt_face_68landmarks_mt.cpp b/lite/trt/cv/trt_face_68landmarks_mt.cpp
new file mode 100644
index 00000000..7db634a6
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks_mt.cpp
@@ -0,0 +1,291 @@
+//
+// Created by root on 11/15/24.
+//
+
+#include "trt_face_68landmarks_mt.h"
+
+trt_face_68landmarks_mt::trt_face_68landmarks_mt(std::string &model_path, size_t num_threads) : num_threads(num_threads){
+
+    // 1. Read the serialized engine file
+    std::ifstream file(model_path, std::ios::binary);
+    if (!file.good()) {
+        std::cerr << "Failed to read model file: " << model_path << std::endl;
+        return;
+    }
+
+    file.seekg(0, std::ifstream::end);
+    size_t model_size = file.tellg();
+    file.seekg(0, std::ifstream::beg);
+    std::vector<char> model_data(model_size);
+    file.read(model_data.data(), model_size);
+    file.close();
+
+    // 2. Create the TensorRT runtime and engine
+    trt_runtime.reset(nvinfer1::createInferRuntime(logger));
+    trt_engine.reset(trt_runtime->deserializeCudaEngine(model_data.data(), model_size));
+
+    if (!trt_engine) {
+        std::cerr << "Failed to deserialize the TensorRT engine." << std::endl;
+        return;
+    }
+
+    // 3. Query the model's I/O tensors
+    int num_io_tensors = trt_engine->getNbIOTensors();
+
+    // 4. Create an execution context and a CUDA stream for each thread
+    trt_contexts.resize(num_threads);
+    streams.resize(num_threads);
+    buffers.resize(num_threads);
+
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        // Create the execution context
+        trt_contexts[thread_id].reset(trt_engine->createExecutionContext());
+        if (!trt_contexts[thread_id]) {
+            std::cerr << "Failed to create execution context for thread " << thread_id << std::endl;
+            return;
+        }
+
+        // Create the CUDA stream
+        cudaStreamCreate(&streams[thread_id]);
+
+        // Allocate input/output buffers for this thread
+        buffers[thread_id].resize(num_io_tensors);
+
+        for (int i = 0; i < num_io_tensors; ++i) {
+            auto tensor_name = trt_engine->getIOTensorName(i);
+            nvinfer1::Dims tensor_dims = trt_engine->getTensorShape(tensor_name);
+
+            // Handle the input tensor
+            if (i == 0) {
+                size_t tensor_size = 1;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                    if (thread_id == 0) { // record the input dims only on the first thread
+                        input_node_dims.push_back(tensor_dims.d[j]);
+                    }
+                }
+                cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+                trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+                continue;
+            }
+
+            // Handle the output tensors
+            size_t tensor_size = 1;
+            if (thread_id == 0) { // record the output dims only on the first thread
+                std::vector<int64_t> output_node;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    output_node.push_back(tensor_dims.d[j]);
+                    tensor_size *= tensor_dims.d[j];
+                }
+                output_node_dims.push_back(output_node);
+            } else {
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                }
+            }
+
+            cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+            trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+
+            if (thread_id == 0) {
+                output_tensor_size++;
+            }
+        }
+    }
+
+    // 5. Start the worker threads
+    for (size_t i = 0; i < num_threads; ++i) {
+        worker_threads.emplace_back(&trt_face_68landmarks_mt::worker_function, this, i);
+    }
+}
+
+// The related implementations below are adjusted in this .cpp file
+void trt_face_68landmarks_mt::worker_function(int thread_id) {
+    while (true) {
+        InferenceTask task;
+        bool has_task = false;
+
+        // Fetch a task from the queue
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex);
+            if (!task_queue.empty()) {
+                task = std::move(task_queue.front());
+                task_queue.pop();
+                has_task = true;
+                active_tasks++;
+            } else if (stop_flag) {
+                break;
+            } else {
+                condition.wait(lock);
+                continue;
+            }
+        }
+
+        if (has_task) {
+            // Process the task
+            process_single_task(task, thread_id);
+
+            // Update the active-task counter
+            {
+                std::lock_guard<std::mutex> lock(completion_mutex);
+                active_tasks--;
+                completion_cv.notify_all();
+            }
+        }
+    }
+}
+
+void
+trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img) {
+    float xmin = bounding_box.x1;
+    float ymin = bounding_box.y1;
+    float xmax = bounding_box.x2;
+    float ymax = bounding_box.y2;
+
+    float width = xmax - xmin;
+    float height = ymax - ymin;
+    float max_side = std::max(width, height);
+    float scale = 195.0f / max_side;
+
+    float center_x = (xmax + xmin) * scale;
+    float center_y = (ymax + ymin) * scale;
+
+    cv::Point2f translation;
+    translation.x = (256.0f - center_x) * 0.5f;
+    translation.y = (256.0f - center_y) * 0.5f;
+
+    cv::Size crop_size(256, 256);
+
+    std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
+
+    crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
+}
+
+void trt_face_68landmarks_mt::process_single_task( InferenceTask &task, int thread_id) {
+    if (task.input_mat.empty()) return;
+
+    img_with_landmarks = task.input_mat.clone();
+    cv::Mat crop_image;
+
+    preprocess(task.bbox, task.input_mat, crop_image);
+
+    std::vector<float> input_data;
+
+    trtcv::utils::transform::create_tensor(crop_image, input_data, input_node_dims, trtcv::utils::transform::CHW);
+
+    cudaMemcpyAsync(buffers[thread_id][0], input_data.data(),
+                    input_node_dims[0] * input_node_dims[1] * input_node_dims[2] * input_node_dims[3] * sizeof(float),
+                    cudaMemcpyHostToDevice, streams[thread_id]);
+
+    // Synchronize before inference so the host-to-device copy has fully completed
+    cudaStreamSynchronize(streams[thread_id]);
+    bool status = trt_contexts[thread_id]->enqueueV3(streams[thread_id]);
+    cudaStreamSynchronize(streams[thread_id]);
+
+    if (!status){
+        std::cerr << "Failed to infer by TensorRT." << std::endl;
+        return;
+    }
+
+    std::vector<float> output(output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2]);
+    cudaMemcpyAsync(output.data(), buffers[thread_id][1],
+                    output_node_dims[0][0] * output_node_dims[0][1] * output_node_dims[0][2] * sizeof(float),
+                    cudaMemcpyDeviceToHost, streams[thread_id]);
+    cudaStreamSynchronize(streams[thread_id]);
+
+    // Hand the result back through the task's output pointer
+    *task.face_landmark_5of68 = postprocess(output.data());
+
+    task.completion_promise.set_value();
+}
+
+std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs) {
+    std::vector<cv::Point2f> landmarks;
+
+    for (int i = 0; i < 68; ++i)
+    {
+        float x = trt_outputs[i * 3] / 64.0f * 256.f;
+        float y = trt_outputs[i * 3 + 1] / 64.0f * 256.f;
+        landmarks.emplace_back(x, y);
+    }
+
+    cv::Mat inverse_affine_matrix;
+    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+
+    cv::transform(landmarks, landmarks, inverse_affine_matrix);
+
+    return face_utils::convert_face_landmark_68_to_5(landmarks);
+}
+
+void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68) {
+    // Thin wrapper so both the in/out-parameter and the returning overload share one body
+    face_landmark_5of68 = postprocess(trt_outputs);
+}
+
+void trt_face_68landmarks_mt::detect_async(cv::Mat &input_image, const lite::types::Boxf &bbox,
+                                           std::vector<cv::Point2f> &face_landmark_5of68) {
+    auto promise = std::promise<void>();
+    auto future = promise.get_future(); // currently unused; completion is tracked via wait_for_completion()
+
+    // Build the task, passing a pointer to the caller's result vector
+    InferenceTask task{input_image.clone(), bbox, &face_landmark_5of68, std::move(promise)};
+
+    {
+        std::lock_guard<std::mutex> lock(queue_mutex);
+        task_queue.push(std::move(task));
+    }
+
+    condition.notify_one();
+}
+
+void trt_face_68landmarks_mt::shutdown() {
+    // Set the stop flag and wake every worker
+    stop_flag = true;
+    condition.notify_all();
+
+    // Join all worker threads
+    for (auto& thread : worker_threads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
+}
+
+void trt_face_68landmarks_mt::wait_for_completion() {
+    std::unique_lock<std::mutex> lock(completion_mutex);
+    completion_cv.wait(lock, [this]() {
+        return active_tasks == 0 && task_queue.empty();
+    });
+}
+
+trt_face_68landmarks_mt::~trt_face_68landmarks_mt() {
+    shutdown();
+
+    // Release CUDA resources
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        for (auto buffer : buffers[thread_id]) {
+            cudaFree(buffer);
+        }
+        cudaStreamDestroy(streams[thread_id]);
+    }
+}
\ No newline at end of file
diff --git a/lite/trt/cv/trt_face_68landmarks_mt.h b/lite/trt/cv/trt_face_68landmarks_mt.h
new file mode 100644
index 00000000..3088a041
--- /dev/null
+++ b/lite/trt/cv/trt_face_68landmarks_mt.h
@@ -0,0 +1,93 @@
+// trt_face_68landmarks_mt.h
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_LANDMARKS_MT_H_ // distinct suffix so it cannot clash with the restoration guard
+#define LITE_AI_TOOLKIT_TRT_FACE_LANDMARKS_MT_H_
+
+#include "cuda_runtime.h"
+#include "NvInfer.h"
+#include "opencv2/opencv.hpp"
+#include "opencv2/core.hpp"
+#include "lite/trt/core/trt_logger.h"
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_utils.h"
+#include <fstream>
+#include <thread>
+#include <queue>
+#include <mutex>
+#include <condition_variable>
+#include "lite/types.h"
+#include <atomic>
+#include <future>
+#include <memory>
+
+// Task descriptor for the worker pool
+struct InferenceTask {
+    cv::Mat input_mat;
+    lite::types::Boxf bbox;
+    std::vector<cv::Point2f>* face_landmark_5of68; // pointer used to hand the result back to the caller
+    std::promise<void> completion_promise;         // per-task completion signal
+};
+
+class trt_face_68landmarks_mt {
+private:
+    Logger logger;
+
+    // TensorRT components
+    std::unique_ptr<nvinfer1::IRuntime> trt_runtime;
+    std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
+    std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> trt_contexts; // one context per thread
+    std::vector<cudaStream_t> streams;                                      // one stream per thread
+    std::vector<std::vector<void*>> buffers;                                // one buffer set per thread
+
+    // Model I/O dimensions
+    std::vector<int> input_node_dims;
+    std::vector<std::vector<int>> output_node_dims;
+    std::size_t input_tensor_size = 1;
+    std::size_t output_tensor_size = 0;
+
+    // Thread-pool state
+    std::vector<std::thread> worker_threads;
+    std::queue<InferenceTask> task_queue;
+    std::mutex queue_mutex;
+    std::condition_variable condition;
+    std::atomic<bool> stop_flag{false};
+    size_t num_threads;
+
+    std::atomic<int> active_tasks{0}; // tracks in-flight tasks
+    std::mutex completion_mutex;
+    std::condition_variable completion_cv;
+
+    // Worker loop
+    void worker_function(int thread_id);
+
+    // Runs a single inference task
+    void process_single_task(InferenceTask& task, int thread_id);
+
+    void preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img);
+
+    void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
+
+    std::vector<cv::Point2f> postprocess(float *trt_outputs);
+
+private:
+    // NOTE: these members are shared across worker threads, so concurrent
+    // tasks race on them; results are only well-defined while tasks are
+    // serialized (kept as in the original design).
+    cv::Mat affine_matrix;
+    cv::Mat img_with_landmarks;
+
+public:
+    explicit trt_face_68landmarks_mt(std::string& model_path, size_t num_threads = 4);
+    ~trt_face_68landmarks_mt();
+
+    // Asynchronous task submission
+    void detect_async(cv::Mat& input_image, const lite::types::Boxf& bbox, std::vector<cv::Point2f>& face_landmark_5of68);
+
+    // Explicit shutdown (also called from the destructor)
+    void shutdown();
+
+    // Block until every queued task has finished
+    void wait_for_completion();
+};
+
+#endif // LITE_AI_TOOLKIT_TRT_FACE_LANDMARKS_MT_H_
\ No newline at end of file
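For reference, a minimal driver for this worker-pool API might look like the sketch below. It is a sketch only: the engine path and the frames/faces containers are hypothetical, and because detect_async stores a pointer to the caller's result vector, each result vector must stay alive until wait_for_completion() returns.

    // Hypothetical driver for trt_face_68landmarks_mt (engine path is a placeholder).
    #include "lite/trt/cv/trt_face_68landmarks_mt.h"

    void run_landmarks(std::vector<cv::Mat> &frames,
                       std::vector<lite::types::Boxf> &faces) {
        std::string engine_path = "./2dfan4_fp16.engine"; // placeholder
        trt_face_68landmarks_mt landmarker(engine_path, 4);

        // One result vector per task; they must outlive wait_for_completion().
        std::vector<std::vector<cv::Point2f>> results(frames.size());
        for (size_t i = 0; i < frames.size(); ++i)
            landmarker.detect_async(frames[i], faces[i], results[i]);

        landmarker.wait_for_completion(); // block until the queue drains
        landmarker.shutdown();            // join workers (also done by the destructor)
    }

Note that the shared affine_matrix member flagged above means results are only reliable with num_threads == 1 until that state is made per-task.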
+// + +#include "trt_face_recognizer.h" +using trtcv::TRTFaceFusionFaceRecognizer; + +cv::Mat TRTFaceFusionFaceRecognizer::preprocess(cv::Mat &input_mat, std::vector &face_landmark_5, + cv::Mat &preprocessed_mat) { + cv::Mat crop_image; + cv::Mat affine_martix; + + std::tie(crop_image,affine_martix) = face_utils::warp_face_by_face_landmark_5(input_mat,face_landmark_5,face_utils::ARCFACE_112_V2); + crop_image.convertTo(crop_image,CV_32FC3, 1.0f / 127.5f,-1.0); + cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB); + + return crop_image; +} + + +void TRTFaceFusionFaceRecognizer::detect(cv::Mat &input_mat, std::vector &face_landmark_5, + std::vector &embeding) { + cv::Mat ori_image = input_mat.clone(); + + cv::Mat crop_image = preprocess(input_mat,face_landmark_5,ori_image); + + + std::vector input_vector; + + trtcv::utils::transform::create_tensor(crop_image,input_vector,input_node_dims, + trtcv::utils::transform::CHW); + + cudaMemcpyAsync(buffers[0], input_vector.data(), input_node_dims[0] * input_node_dims[1] * input_node_dims[2] * input_node_dims[3] * sizeof(float), + cudaMemcpyHostToDevice, stream); + + // 在推理之前同步流,確保數據完全拷貝 + cudaStreamSynchronize(stream); + bool status = trt_context->enqueueV3(stream); + cudaStreamSynchronize(stream); + + if (!status){ + std::cerr << "Failed to infer by TensorRT." << std::endl; + return; + } + + std::vector output(output_node_dims[0][0] * output_node_dims[0][1]); + cudaMemcpyAsync(output.data(), buffers[1], output_node_dims[0][0] * output_node_dims[0][1] * sizeof(float), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + embeding.assign(output.begin(),output.end()); + std::vector normal_embeding(output.begin(),output.end()); + + + float norm = 0.0f; + for (const auto &val : normal_embeding) { + norm += val * val; + } + norm = std::sqrt(norm); + + for (auto &val : normal_embeding) { + val /= norm; + } + + std::cout<<"done!"< &face_landmark_5,cv::Mat &preprocessed_mat); + + public: + void detect(cv::Mat &input_mat,std::vector &face_landmark_5,std::vector &embeding); + + }; +} + + + +#endif //LITE_AI_TOOLKIT_TRT_FACE_RECOGNIZER_H diff --git a/lite/trt/cv/trt_face_restoration.cpp b/lite/trt/cv/trt_face_restoration.cpp new file mode 100644 index 00000000..bb874cc5 --- /dev/null +++ b/lite/trt/cv/trt_face_restoration.cpp @@ -0,0 +1,110 @@ +// +// Created by wangzijian on 11/14/24. 
+// + +#include "trt_face_restoration.h" +using trtcv::TRTFaceFusionFaceRestoration; + +void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector &target_landmarks_5, + const std::string &face_enchaner_path) { + auto ori_image = face_swap_image.clone(); + + cv::Mat crop_image; + cv::Mat affine_matrix; + std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,face_utils::FFHQ_512); + + std::vector crop_size = {512,512}; + cv::Mat box_mask = face_utils::create_static_box_mask(crop_size); + std::vector crop_mask_list; + crop_mask_list.emplace_back(box_mask); + + cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB); + crop_image.convertTo(crop_image,CV_32FC3,1.f / 255.f); + crop_image.convertTo(crop_image,CV_32FC3,2.0f,-1.f); + + std::vector input_vector; + trtcv::utils::transform::create_tensor(crop_image,input_vector,input_node_dims,trtcv::utils::transform::CHW); + + // 拷贝 + + // 先不用拷贝了 处理完成再拷贝出来 类似于整个后处理放在GPU上完成 + cudaMemcpyAsync(buffers[0],input_vector.data(),1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyHostToDevice,stream); + + // 同步 + cudaStreamSynchronize(stream); + + // 推理 + bool status = trt_context->enqueueV3(stream); + if (!status) { + std::cerr << "Failed to inference" << std::endl; + return; + } + + + // 同步 + cudaStreamSynchronize(stream); + std::vector transposed_data(1 * 3 * 512 * 512); + +// std::vector transposed_data(1 * 3 * 512 * 512); + + // 这里buffer1就是输出了 + launch_face_restoration_postprocess( + static_cast(buffers[1]), + transposed_data.data(), + 3, + 512, + 512 + ); + + std::vector transposed_data_float(transposed_data.begin(), + transposed_data.end()); + + + // 获取输出 + std::vector output_vector(1 * 3 * 512 * 512); +// cudaMemcpyAsync(output_vector.data(),buffers[1],1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyDeviceToHost,stream); + cudaStreamSynchronize(stream); +// + // 后处理 + int channel = 3; + int height = 512; + int width = 512; +// std::vector output(channel * height * width); +// output.assign(output_vector.begin(),output_vector.end()); +// +// std::transform(output.begin(),output.end(),output.begin(), +// [](double x){return std::max(-1.0,std::max(-1.0,std::min(1.0,x)));}); +// +// std::transform(output.begin(),output.end(),output.begin(), +// [](double x){return (x + 1.f) /2.f;}); +// +// // CHW2HWC +// for (int c = 0; c < channel; ++c){ +// for (int h = 0 ; h < height; ++h){ +// for (int w = 0; w < width ; ++w){ +// int src_index = c * (height * width) + h * width + w; +// int dst_index = h * (width * channel) + w * channel + c; +// transposed_data[dst_index] = output[src_index]; +// } +// } +// } +// +// std::transform(transposed_data.begin(),transposed_data.end(),transposed_data.begin(), +// [](float x){return std::round(x * 255.f);}); +// +// std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(), +// [](float x) { return static_cast(x); }); + + + cv::Mat mat(height, width, CV_32FC3, transposed_data_float.data()); +// cv::imwrite("/home/lite.ai.toolkit/mid_process.jpg",mat); + cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR); + + + auto crop_mask = crop_mask_list[0]; + cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix); + + cv::Mat dst_image = face_utils::blend_frame(ori_image,paste_frame); + + cv::imwrite(face_enchaner_path,dst_image); +} \ No newline at end of file diff --git a/lite/trt/cv/trt_face_restoration.h b/lite/trt/cv/trt_face_restoration.h new file mode 100644 index 00000000..a525f657 --- /dev/null +++ 
diff --git a/lite/trt/cv/trt_face_restoration.h b/lite/trt/cv/trt_face_restoration.h
new file mode 100644
index 00000000..a525f657
--- /dev/null
+++ b/lite/trt/cv/trt_face_restoration.h
@@ -0,0 +1,23 @@
+//
+// Created by wangzijian on 11/14/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_H
+#define LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_H
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/core/trt_utils.h"
+#include "lite/trt/core/trt_config.h"
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/kernel/face_restoration_postprocess_manager.h"
+namespace trtcv{
+    class LITE_EXPORTS TRTFaceFusionFaceRestoration : BasicTRTHandler{
+    public:
+        explicit TRTFaceFusionFaceRestoration(const std::string& _trt_model_path, unsigned int _num_threads = 1) :
+                BasicTRTHandler(_trt_model_path, _num_threads){};
+    public:
+        void detect(cv::Mat &face_swap_image, std::vector<cv::Point2f> &target_landmarks_5, const std::string &face_enchaner_path);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_H
diff --git a/lite/trt/cv/trt_face_restoration_mt.cpp b/lite/trt/cv/trt_face_restoration_mt.cpp
new file mode 100644
index 00000000..496ac4ba
--- /dev/null
+++ b/lite/trt/cv/trt_face_restoration_mt.cpp
@@ -0,0 +1,289 @@
+// trt_face_restoration_mt.cpp
+#include "trt_face_restoration_mt.h"
+
+trt_face_restoration_mt::trt_face_restoration_mt(std::string& model_path, size_t num_threads)
+        : num_threads(num_threads) {
+    // 1. Read the serialized engine file
+    std::ifstream file(model_path, std::ios::binary);
+    if (!file.good()) {
+        std::cerr << "Failed to read model file: " << model_path << std::endl;
+        return;
+    }
+
+    file.seekg(0, std::ifstream::end);
+    size_t model_size = file.tellg();
+    file.seekg(0, std::ifstream::beg);
+    std::vector<char> model_data(model_size);
+    file.read(model_data.data(), model_size);
+    file.close();
+
+    // 2. Create the TensorRT runtime and deserialize the engine
+    trt_runtime.reset(nvinfer1::createInferRuntime(logger));
+    trt_engine.reset(trt_runtime->deserializeCudaEngine(model_data.data(), model_size));
+
+    if (!trt_engine) {
+        std::cerr << "Failed to deserialize the TensorRT engine." << std::endl;
+        return;
+    }
+
+    // 3. Query the engine's I/O tensor count
+    int num_io_tensors = trt_engine->getNbIOTensors();
+
+    // 4. Create one execution context and one CUDA stream per worker thread
+    trt_contexts.resize(num_threads);
+    streams.resize(num_threads);
+    buffers.resize(num_threads);
+
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        // Execution context
+        trt_contexts[thread_id].reset(trt_engine->createExecutionContext());
+        if (!trt_contexts[thread_id]) {
+            std::cerr << "Failed to create execution context for thread " << thread_id << std::endl;
+            return;
+        }
+
+        // CUDA stream
+        cudaStreamCreate(&streams[thread_id]);
+
+        // Per-thread input/output device buffers
+        buffers[thread_id].resize(num_io_tensors);
+
+        for (int i = 0; i < num_io_tensors; ++i) {
+            auto tensor_name = trt_engine->getIOTensorName(i);
+            nvinfer1::Dims tensor_dims = trt_engine->getTensorShape(tensor_name);
+
+            // Input tensor
+            if (i == 0) {
+                size_t tensor_size = 1;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                    if (thread_id == 0) { // record input dims once, on the first thread
+                        input_node_dims.push_back(tensor_dims.d[j]);
+                    }
+                }
+                cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+                trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+                continue;
+            }
+
+            // Output tensors
+            size_t tensor_size = 1;
+            if (thread_id == 0) { // record output dims once, on the first thread
+                std::vector<int> output_node;
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    output_node.push_back(tensor_dims.d[j]);
+                    tensor_size *= tensor_dims.d[j];
+                }
+                output_node_dims.push_back(output_node);
+            } else {
+                for (int j = 0; j < tensor_dims.nbDims; ++j) {
+                    tensor_size *= tensor_dims.d[j];
+                }
+            }
+
+            cudaMalloc(&buffers[thread_id][i], tensor_size * sizeof(float));
+            trt_contexts[thread_id]->setTensorAddress(tensor_name, buffers[thread_id][i]);
+
+            if (thread_id == 0) {
+                output_tensor_size++;
+            }
+        }
+    }
+
+    // 5. Launch the worker threads
+    for (size_t i = 0; i < num_threads; ++i) {
+        worker_threads.emplace_back(&trt_face_restoration_mt::worker_function, this, i);
+    }
+}
+
+// Worker loop: pop tasks from the queue until shutdown is requested
+void trt_face_restoration_mt::worker_function(int thread_id) {
+    while (true) {
+        InferenceTaskTest task;
+        bool has_task = false;
+
+        // Fetch a task from the queue
+        {
+            std::unique_lock<std::mutex> lock(queue_mutex);
+            if (!task_queue.empty()) {
+                task = std::move(task_queue.front());
+                task_queue.pop();
+                has_task = true;
+                active_tasks++;
+            } else if (stop_flag) {
+                break;
+            } else {
+                condition.wait(lock);
+                continue;
+            }
+        }
+
+        if (has_task) {
+            // Run the task
+            process_single_task(task, thread_id);
+
+            // Update the in-flight task count
+            {
+                std::lock_guard<std::mutex> lock(completion_mutex);
+                active_tasks--;
+                completion_cv.notify_all();
+            }
+        }
+    }
+}
+
+void trt_face_restoration_mt::process_single_task(const InferenceTaskTest& task, int thread_id) {
+    auto ori_image = task.face_swap_image.clone();
+
+    // 1. Preprocess
+    cv::Mat crop_image;
+    cv::Mat affine_matrix;
+    std::tie(crop_image, affine_matrix) = face_utils::warp_face_by_face_landmark_5(
+            task.face_swap_image,
+            task.target_landmarks_5,
+            face_utils::FFHQ_512
+    );
+
+    std::vector<float> crop_size = {512, 512};
+    cv::Mat box_mask = face_utils::create_static_box_mask(crop_size);
+    std::vector<cv::Mat> crop_mask_list;
+    crop_mask_list.emplace_back(box_mask);
+
+    cv::cvtColor(crop_image, crop_image, cv::COLOR_BGR2RGB);
+    crop_image.convertTo(crop_image, CV_32FC3, 1.f / 255.f);
+    crop_image.convertTo(crop_image, CV_32FC3, 2.0f, -1.f); // [0,1] -> [-1,1]
+
+    std::vector<float> input_vector;
+    trtcv::utils::transform::create_tensor(
+            crop_image,
+            input_vector,
+            input_node_dims,
+            trtcv::utils::transform::CHW
+    );
+
+    // 2. Copy the input to the GPU
+    cudaMemcpyAsync(
+            buffers[thread_id][0],
+            input_vector.data(),
+            1 * 3 * 512 * 512 * sizeof(float),
+            cudaMemcpyHostToDevice,
+            streams[thread_id]
+    );
+
+    // 3. Synchronize, then run inference
+    cudaStreamSynchronize(streams[thread_id]);
+    bool status = trt_contexts[thread_id]->enqueueV3(streams[thread_id]);
+
+    if (!status) {
+        std::cerr << "Failed to inference in thread " << thread_id << std::endl;
+        return;
+    }
+
+    cudaStreamSynchronize(streams[thread_id]);
+
+    // 4. Copy the output back to the host
+    std::vector<float> output_vector(1 * 3 * 512 * 512);
+    cudaMemcpyAsync(
+            output_vector.data(),
+            buffers[thread_id][1],
+            1 * 3 * 512 * 512 * sizeof(float),
+            cudaMemcpyDeviceToHost,
+            streams[thread_id]
+    );
+
+    cudaStreamSynchronize(streams[thread_id]);
+
+    // 5. Post-process on the CPU
+    int channel = 3;
+    int height = 512;
+    int width = 512;
+    std::vector<float> output(channel * height * width);
+    output.assign(output_vector.begin(), output_vector.end());
+
+    // Clamp to [-1, 1]
+    std::transform(output.begin(), output.end(), output.begin(),
+                   [](float x) { return std::max(-1.0f, std::min(1.0f, x)); });
+
+    // Map to [0, 1]
+    std::transform(output.begin(), output.end(), output.begin(),
+                   [](float x) { return (x + 1.f) / 2.f; });
+
+    // CHW -> HWC
+    std::vector<float> transposed_data(channel * height * width);
+    for (int c = 0; c < channel; ++c) {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+                int src_index = c * (height * width) + h * width + w;
+                int dst_index = h * (width * channel) + w * channel + c;
+                transposed_data[dst_index] = output[src_index];
+            }
+        }
+    }
+
+    // Scale to [0, 255] and round; values stay stored as float for CV_32FC3
+    std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(),
+                   [](float x) { return std::round(x * 255.f); });
+
+    // 6. Wrap as an image
+    cv::Mat mat(height, width, CV_32FC3, transposed_data.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    // 7. Paste back, blend, and save
+    auto crop_mask = crop_mask_list[0];
+    cv::Mat paste_frame = face_utils::paste_back(ori_image, mat, crop_mask, affine_matrix);
+    cv::Mat dst_image = face_utils::blend_frame(ori_image, paste_frame);
+    cv::imwrite(task.face_enchaner_path, dst_image);
+}
+
+void trt_face_restoration_mt::detect_async(
+        cv::Mat& face_swap_image,
+        std::vector<cv::Point2f>& target_landmarks_5,
+        const std::string& face_enchaner_path
+) {
+    InferenceTaskTest task{face_swap_image.clone(), target_landmarks_5, face_enchaner_path};
+
+    {
+        std::lock_guard<std::mutex> lock(queue_mutex);
+        task_queue.push(std::move(task));
+    }
+
+    condition.notify_one();
+}
+
+void trt_face_restoration_mt::shutdown() {
+    // Set the stop flag and wake every worker
+    stop_flag = true;
+    condition.notify_all();
+
+    // Join all worker threads
+    for (auto& thread : worker_threads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
+}
+
+void trt_face_restoration_mt::wait_for_completion() {
+    std::unique_lock<std::mutex> lock(completion_mutex);
+    completion_cv.wait(lock, [this]() {
+        return active_tasks == 0 && task_queue.empty();
+    });
+}
+
+trt_face_restoration_mt::~trt_face_restoration_mt() {
+    shutdown();
+
+    // Release CUDA resources
+    for (size_t thread_id = 0; thread_id < num_threads; ++thread_id) {
+        for (auto buffer : buffers[thread_id]) {
+            cudaFree(buffer);
+        }
+        cudaStreamDestroy(streams[thread_id]);
+    }
+}
\ No newline at end of file
diff --git a/lite/trt/cv/trt_face_restoration_mt.h b/lite/trt/cv/trt_face_restoration_mt.h
new file mode 100644
index 00000000..f9cedcbb
--- /dev/null
+++ b/lite/trt/cv/trt_face_restoration_mt.h
@@ -0,0 +1,77 @@
+// trt_face_restoration_mt.h
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_MT_H_ // note the _MT_ suffix
+#define LITE_AI_TOOLKIT_TRT_FACE_RESTORATION_MT_H_
+
+#include "cuda_runtime.h"
+#include "NvInfer.h"
+#include "opencv2/opencv.hpp"
+#include "opencv2/core.hpp"
+#include "lite/trt/core/trt_logger.h"
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_utils.h"
+#include <fstream>
+#include <thread>
+#include <queue>
+#include <mutex>
+#include <condition_variable>
+#include <atomic>
+#include <future>
+#include <memory>
+
+// Task descriptor for the worker pool
+struct InferenceTaskTest {
+    cv::Mat face_swap_image;
+    std::vector<cv::Point2f> target_landmarks_5;
+    std::string face_enchaner_path;
+};
+
+class trt_face_restoration_mt {
+private:
+    Logger logger;
+
+    // TensorRT components
+    std::unique_ptr<nvinfer1::IRuntime> trt_runtime;
+    std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
+    std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> trt_contexts; // one context per thread
+    std::vector<cudaStream_t> streams;                                      // one stream per thread
+    std::vector<std::vector<void*>> buffers;                                // one buffer set per thread
+
+    // Model I/O dimensions
+    std::vector<int> input_node_dims;
+    std::vector<std::vector<int>> output_node_dims;
+    std::size_t input_tensor_size = 1;
+    std::size_t output_tensor_size = 0;
+
+    // Thread-pool state
+    std::vector<std::thread> worker_threads;
+    std::queue<InferenceTaskTest> task_queue;
+    std::mutex queue_mutex;
+    std::condition_variable condition;
+    std::atomic<bool> stop_flag{false};
+    size_t num_threads;
+
+    std::atomic<int> active_tasks{0}; // tracks in-flight tasks
+    std::mutex completion_mutex;
+    std::condition_variable completion_cv;
+
+    // Worker loop
+    void worker_function(int thread_id);
+
+    // Runs a single inference task
+    void process_single_task(const InferenceTaskTest& task, int thread_id);
+
+public:
+    explicit trt_face_restoration_mt(std::string& model_path, size_t num_threads = 4);
+    ~trt_face_restoration_mt();
+
+    // Asynchronous task submission
+    void detect_async(cv::Mat& face_swap_image,
+                      std::vector<cv::Point2f>& target_landmarks_5,
+                      const std::string& face_enchaner_path);
+
+    // Explicit shutdown (also called from the destructor)
+    void shutdown();
+
+    // Block until every queued task has finished
+    void wait_for_completion();
+};
+#endif
\ No newline at end of file
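As with the landmark pool, a minimal sketch of driving this class (the engine path and output names are placeholders; each task clones its input Mat, so the caller's images may be reused immediately):

    #include "lite/trt/cv/trt_face_restoration_mt.h"
    #include <string>

    // Sketch only: restores several swapped faces concurrently.
    void restore_batch(std::vector<cv::Mat> &swapped,
                       std::vector<std::vector<cv::Point2f>> &landmarks) {
        std::string engine_path = "./gfpgan_1.4_fp16.engine"; // placeholder
        trt_face_restoration_mt restorer(engine_path, 4);
        for (size_t i = 0; i < swapped.size(); ++i)
            restorer.detect_async(swapped[i], landmarks[i],
                                  "restored_" + std::to_string(i) + ".jpg");
        restorer.wait_for_completion(); // all outputs are on disk after this returns
    }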
diff --git a/lite/trt/cv/trt_face_swap.cpp b/lite/trt/cv/trt_face_swap.cpp
new file mode 100644
index 00000000..e0f1358f
--- /dev/null
+++ b/lite/trt/cv/trt_face_swap.cpp
@@ -0,0 +1,90 @@
+//
+// Created by wangzijian on 11/13/24.
+//
+
+#include "trt_face_swap.h"
+using trtcv::TRTFaceFusionFaceSwap;
+
+void TRTFaceFusionFaceSwap::preprocess(cv::Mat &target_face, std::vector<float> source_image_embeding,
+                                       std::vector<cv::Point2f> target_landmark_5,
+                                       std::vector<float> &processed_source_embeding, cv::Mat &preprocessed_mat) {
+
+    std::tie(preprocessed_mat, affine_martix) = face_utils::warp_face_by_face_landmark_5(target_face, target_landmark_5, face_utils::ARCFACE_128_V2);
+
+    std::vector<float> crop_size = {128.0, 128.0};
+    crop_list.emplace_back(face_utils::create_static_box_mask(crop_size));
+
+    cv::cvtColor(preprocessed_mat, preprocessed_mat, cv::COLOR_BGR2RGB);
+    preprocessed_mat.convertTo(preprocessed_mat, CV_32FC3, 1.0 / 255.f);
+
+    // TODO: the model matrix path is hard-coded and should come from configuration
+    std::vector<float> model_martix = face_utils::load_npy("/home/facefusion-onnxrun/python/model_matrix.npy");
+
+    processed_source_embeding = face_utils::dot_product(source_image_embeding, model_martix, 512);
+
+    face_utils::normalize(processed_source_embeding);
+}
+
+void TRTFaceFusionFaceSwap::detect(cv::Mat &target_image, std::vector<float> source_face_embeding,
+                                   std::vector<cv::Point2f> target_landmark_5, cv::Mat &face_swap_image) {
+    cv::Mat ori_image = target_image.clone();
+    std::vector<float> source_embeding_input;
+    cv::Mat model_input_mat;
+    preprocess(target_image, source_face_embeding, target_landmark_5, source_embeding_input, model_input_mat);
+
+    std::vector<float> input_vector;
+    trtcv::utils::transform::create_tensor(model_input_mat, input_vector, input_node_dims, trtcv::utils::transform::CHW);
+
+    // buffers[0]: target image tensor; buffers[1]: processed source embedding
+    cudaMemcpyAsync(buffers[0], input_vector.data(), 1 * 3 * 128 * 128 * sizeof(float), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(buffers[1], source_embeding_input.data(), 512 * sizeof(float), cudaMemcpyHostToDevice, stream);
+
+    // Synchronize before inference so both copies have completed
+    cudaStreamSynchronize(stream);
+
+    // Inference
+    bool status = trt_context->enqueueV3(stream);
+    if (!status) {
+        std::cerr << "Failed to enqueue TensorRT model." << std::endl;
+        return;
+    }
+
+    // Copy the swapped face back to the host
+    std::vector<float> output_vector(3 * 128 * 128);
+    cudaMemcpyAsync(output_vector.data(), buffers[2], 1 * 3 * 128 * 128 * sizeof(float), cudaMemcpyDeviceToHost, stream);
+    cudaStreamSynchronize(stream);
+
+    std::vector<float> output_swap_image(1 * 3 * 128 * 128);
+    output_swap_image.assign(output_vector.begin(), output_vector.end());
+
+    // CHW -> HWC
+    std::vector<float> transposed(3 * 128 * 128);
+    int channels = 3;
+    int height = 128;
+    int width = 128;
+
+    for (int c = 0; c < channels; ++c) {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+                int src_idx = c * (height * width) + h * width + w;       // CHW
+                int dst_idx = h * (width * channels) + w * channels + c;  // HWC
+                transposed[dst_idx] = output_swap_image[src_idx];
+            }
+        }
+    }
+
+    for (auto& val : transposed) {
+        val = std::round(val * 255.0);
+    }
+
+    cv::Mat mat(height, width, CV_32FC3, transposed.data());
+    cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+
+    cv::Mat dst_image = face_utils::paste_back(ori_image, mat, crop_list[0], affine_martix);
+    face_swap_image = dst_image;
+}
\ No newline at end of file
diff --git a/lite/trt/cv/trt_face_swap.h b/lite/trt/cv/trt_face_swap.h
new file mode 100644
index 00000000..fac57666
--- /dev/null
+++ b/lite/trt/cv/trt_face_swap.h
@@ -0,0 +1,32 @@
+//
+// Created by wangzijian on 11/13/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACE_SWAP_H
+#define LITE_AI_TOOLKIT_TRT_FACE_SWAP_H
+#include "lite/ort/cv/face_utils.h"
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/core/trt_utils.h"
+#include "lite/trt/core/trt_types.h"
+
+namespace trtcv{
+    class LITE_EXPORTS TRTFaceFusionFaceSwap : BasicTRTHandler{
+    public:
+        explicit TRTFaceFusionFaceSwap(const std::string& _trt_model_path, unsigned int _num_threads = 1):
+                BasicTRTHandler(_trt_model_path, _num_threads){};
+    private:
+        void preprocess(cv::Mat &target_face, std::vector<float> source_image_embeding, std::vector<cv::Point2f> target_landmark_5,
+                        std::vector<float> &processed_source_embeding, cv::Mat &preprocessed_mat);
+
+    private:
+        std::vector<cv::Mat> crop_list;
+        cv::Mat affine_martix;
+    public:
+        void detect(cv::Mat &target_image, std::vector<float> source_face_embeding, std::vector<cv::Point2f> target_landmark_5,
+                    cv::Mat &face_swap_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACE_SWAP_H
diff --git a/lite/trt/cv/trt_facefusion_pipeline.cpp b/lite/trt/cv/trt_facefusion_pipeline.cpp
new file mode 100644
index 00000000..717119b5
--- /dev/null
+++ b/lite/trt/cv/trt_facefusion_pipeline.cpp
@@ -0,0 +1,47 @@
+//
+// Created by wangzijian on 11/14/24.
+//
+
+#include "trt_facefusion_pipeline.h"
+using trtcv::TRTFaceFusionPipeLine;
+
+TRTFaceFusionPipeLine::TRTFaceFusionPipeLine(const std::string &face_detect_engine_path,
+                                             const std::string &face_landmarks_68_engine_path,
+                                             const std::string &face_recognizer_engine_path,
+                                             const std::string &face_swap_engine_path,
+                                             const std::string &face_restoration_engine_path) {
+    face_detect = std::make_unique<TRTYoloFaceV8>(face_detect_engine_path, 1);
+    face_landmarks = std::make_unique<TRTFaceFusionFace68Landmarks>(face_landmarks_68_engine_path, 1);
+    face_recognizer = std::make_unique<TRTFaceFusionFaceRecognizer>(face_recognizer_engine_path, 1);
+    face_swap = std::make_unique<TRTFaceFusionFaceSwap>(face_swap_engine_path, 1);
+    face_restoration = std::make_unique<TRTFaceFusionFaceRestoration>(face_restoration_engine_path, 1);
+}
+
+void TRTFaceFusionPipeLine::detect(const std::string &source_image, const std::string &target_image,
+                                   const std::string &save_image) {
+    // Source image: detect a face, then extract landmarks and the identity embedding
+    std::vector<lite::types::Boxf> detected_boxes;
+    cv::Mat img_bgr = cv::imread(source_image);
+    face_detect->detect(img_bgr, detected_boxes, 0.25f, 0.45f);
+
+    auto test_bounding_box = detected_boxes[0]; // use the first detected face
+    std::vector<cv::Point2f> face_landmark_5of68;
+
+    face_landmarks->detect(img_bgr, test_bounding_box, face_landmark_5of68);
+    std::vector<float> source_image_embeding;
+    face_recognizer->detect(img_bgr, face_landmark_5of68, source_image_embeding);
+
+    // Target image: detect the first face and its landmarks
+    std::vector<lite::types::Boxf> target_detected_boxes;
+    cv::Mat target_img_bgr = cv::imread(target_image);
+    face_detect->detect(target_img_bgr, target_detected_boxes, 0.25f, 0.45f);
+    auto target_test_bounding_box = target_detected_boxes[0];
+    std::vector<cv::Point2f> target_face_landmark_5of68;
+    face_landmarks->detect(target_img_bgr, target_test_bounding_box, target_face_landmark_5of68);
+
+    // Swap the identity in, then restore the face and save the result
+    cv::Mat face_swap_image;
+    face_swap->detect(target_img_bgr, source_image_embeding, target_face_landmark_5of68, face_swap_image);
+    face_restoration->detect(face_swap_image, target_face_landmark_5of68, save_image);
+}
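A minimal end-to-end invocation might look like the following; all five engine paths are placeholders for TensorRT engines built from the corresponding ONNX models:

    #include "lite/trt/cv/trt_facefusion_pipeline.h"

    int main() {
        // Sketch only: engine paths are hypothetical.
        trtcv::TRTFaceFusionPipeLine pipeline(
                "./yoloface_8n.engine",       // face detection
                "./2dfan4.engine",            // 68 landmarks
                "./arcface_w600k_r50.engine", // recognizer
                "./inswapper_128.engine",     // swapper
                "./gfpgan_1.4.engine");       // restoration
        pipeline.detect("source.jpg", "target.jpg", "result.jpg");
        return 0;
    }

Note that detect always uses the first detected face in both images, so multi-face frames would need caller-side selection.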
diff --git a/lite/trt/cv/trt_facefusion_pipeline.h b/lite/trt/cv/trt_facefusion_pipeline.h
new file mode 100644
index 00000000..b5b608e7
--- /dev/null
+++ b/lite/trt/cv/trt_facefusion_pipeline.h
@@ -0,0 +1,40 @@
+//
+// Created by wangzijian on 11/14/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_FACEFUSION_PIPELINE_H
+#define LITE_AI_TOOLKIT_TRT_FACEFUSION_PIPELINE_H
+
+#include "lite/trt/core/trt_core.h"
+#include "lite/trt/cv/trt_face_restoration.h"
+#include "lite/trt/cv/trt_face_swap.h"
+#include "lite/trt/cv/trt_face_recognizer.h"
+#include "lite/trt/cv/trt_yolofacev8.h"
+#include "lite/trt/cv/trt_face_68landmarks.h"
+
+namespace trtcv{
+    class TRTFaceFusionPipeLine{
+    public:
+        TRTFaceFusionPipeLine(
+                const std::string &face_detect_engine_path,
+                const std::string &face_landmarks_68_engine_path,
+                const std::string &face_recognizer_engine_path,
+                const std::string &face_swap_engine_path,
+                const std::string &face_restoration_engine_path
+        );
+
+    private:
+        std::unique_ptr<TRTFaceFusionFaceRestoration> face_restoration;
+        std::unique_ptr<TRTYoloFaceV8> face_detect;
+        std::unique_ptr<TRTFaceFusionFace68Landmarks> face_landmarks;
+        std::unique_ptr<TRTFaceFusionFaceRecognizer> face_recognizer;
+        std::unique_ptr<TRTFaceFusionFaceSwap> face_swap;
+
+    public:
+        void detect(const std::string &source_image, const std::string &target_image, const std::string &save_image);
+    };
+}
+
+#endif //LITE_AI_TOOLKIT_TRT_FACEFUSION_PIPELINE_H
diff --git a/lite/trt/cv/trt_modnet.cpp b/lite/trt/cv/trt_modnet.cpp
index 2a590960..8bbfcc6d 100644
--- a/lite/trt/cv/trt_modnet.cpp
+++ b/lite/trt/cv/trt_modnet.cpp
@@ -91,7 +91,7 @@ void TRTMODNet::generate_matting(float *trt_outputs, const cv::Mat &mat, types::MattingContent &content, bool remove_noise, bool minimum_post_process)
     const unsigned int out_w = 512;
 
     cv::Mat alpha_pred(out_h, out_w, CV_32FC1, trt_outputs);
-    cv::imwrite("/home/lite.ai.toolkit/modnet.jpg",alpha_pred);
+
     // post process
     if (remove_noise) trtcv::utils::remove_small_connected_area(alpha_pred,0.05f);
     // resize alpha
diff --git a/lite/trt/cv/trt_yolofacev8.cpp b/lite/trt/cv/trt_yolofacev8.cpp
index 8d411592..ca3381ca 100644
--- a/lite/trt/cv/trt_yolofacev8.cpp
+++ b/lite/trt/cv/trt_yolofacev8.cpp
@@ -20,6 +20,12 @@ float TRTYoloFaceV8::get_iou(const lite::types::Boxf box1, const lite::types::Boxf box2)
 }
 
+
+std::vector<int>
+TRTYoloFaceV8::nms_cuda(std::vector<lite::types::Boxf> boxes, std::vector<float> confidences, const float nms_thresh) {
+    return nms_cuda_manager->perform_nms(boxes, confidences, nms_thresh);
+}
+
 std::vector<int> TRTYoloFaceV8::nms(std::vector<lite::types::Boxf> boxes, std::vector<float> confidences, const float nms_thresh) {
     sort(confidences.begin(), confidences.end(), [&confidences](size_t index_1, size_t index_2) { return confidences[index_1] > confidences[index_2]; });
@@ -96,30 +102,33 @@ void TRTYoloFaceV8::generate_box(float *trt_outputs, std::vector<lite::types::Boxf> &boxes, float conf_threshold, float iou_threshold)
-    std::vector<lite::types::BoundingBoxType<float, float>> bounding_box_raw;
+
+    // Allocate the result vector directly in its target type
+    std::vector<lite::types::BoundingBoxType<float, float>> bounding_box_raw(num_box);
+
+    // Launch the CUDA post-process wrapper
+    launch_yolov8_postprocess(
+            static_cast<float*>(buffers[1]),
+            num_box,
+            conf_threshold,
+            ratio_height,
+            ratio_width,
+            bounding_box_raw.data(),
+            num_box
+    );
+
+    // The kernel compacts surviving boxes with atomicAdd and leaves the tail
+    // value-initialized (flag == false), so drop that tail before NMS.
+    bounding_box_raw.erase(
+            std::remove_if(bounding_box_raw.begin(), bounding_box_raw.end(),
+                           [](const lite::types::Boxf &b) { return !b.flag; }),
+            bounding_box_raw.end());
+
     std::vector<float> score_raw;
-    for (int i = 0; i < num_box; i++)
-    {
-        const float score = trt_outputs[4 * num_box + i];
-        if (score > conf_threshold)
-        {
-            float x1 = (trt_outputs[i] - 0.5 * trt_outputs[2 * num_box + i]) * ratio_width;
-            float y1 = (trt_outputs[num_box + i] - 0.5 * trt_outputs[3 * num_box + i]) * ratio_height;
-            float x2 = (trt_outputs[i] + 0.5 * trt_outputs[2 * num_box + i]) * ratio_width;
-            float y2 = (trt_outputs[num_box + i] + 0.5 * trt_outputs[3 * num_box + i]) * ratio_height;
-
-            lite::types::BoundingBoxType<float, float> bbox;
-            bbox.x1 = x1;
-            bbox.y1 = y1;
-            bbox.x2 = x2;
-            bbox.y2 = y2;
-            bbox.score = score;
-            bbox.flag = true;
-            bounding_box_raw.emplace_back(bbox);
-            score_raw.emplace_back(score);
-        }
-    }
-    std::vector<int> keep_inds = this->nms(bounding_box_raw, score_raw, iou_threshold);
+    for (const auto& bbox : bounding_box_raw) {
+        score_raw.emplace_back(bbox.score);
+    }
+
+    std::vector<int> keep_inds = nms_cuda(bounding_box_raw, score_raw, iou_threshold);
+
     const int keep_num = keep_inds.size();
     boxes.clear();
     boxes.resize(keep_num);
@@ -135,6 +144,18 @@
 void TRTYoloFaceV8::detect(const cv::Mat &mat, std::vector<lite::types::Boxf> &boxes, float conf_threshold, float iou_threshold)
 {
+    // Validate the input image
+    if (mat.empty()) {
+        std::cerr << "Input image is empty!" << std::endl;
+        return;
+    }
+
+    // Validate the TensorRT context
+    if (!trt_context) {
+        std::cerr << "TensorRT context is null!" << std::endl;
+        return;
+    }
+
     // 1.normalized the input
     cv::Mat normalized_image = normalize(mat);
diff --git a/lite/trt/cv/trt_yolofacev8.h b/lite/trt/cv/trt_yolofacev8.h
index 9f6fcd93..b9752d54 100644
--- a/lite/trt/cv/trt_yolofacev8.h
+++ b/lite/trt/cv/trt_yolofacev8.h
@@ -6,14 +6,27 @@
 #define LITE_AI_TOOLKIT_TRT_YOLOFACEV8_H
 #include "lite/trt/core/trt_core.h"
 #include "lite/trt/core/trt_utils.h"
-
+#include "lite/trt/kernel/nms_cuda_manager.h"
+#include "lite/trt/kernel/generate_bbox_cuda_manager.h"
 namespace trtcv{
     class LITE_EXPORTS TRTYoloFaceV8 : public BasicTRTHandler{
+
+    public:
+
+        std::unique_ptr<NMSCudaManager> nms_cuda_manager;
+
         explicit TRTYoloFaceV8(const std::string& _trt_model_path,unsigned int _num_threads = 1):
                 BasicTRTHandler(_trt_model_path, _num_threads)
-        {};
+        {
+            nms_cuda_manager = std::make_unique<NMSCudaManager>();
+        };
+
+        std::vector<int> nms_cuda(std::vector<lite::types::Boxf> boxes,
+                                  std::vector<float> confidences,
+                                  const float nms_thresh);
 
     private:
         float mean = -127.5 / 128.0;
diff --git a/lite/trt/kernel/face_restoration_postprocess.cu b/lite/trt/kernel/face_restoration_postprocess.cu
new file mode 100644
index 00000000..8dacd0d0
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess.cu
@@ -0,0 +1,44 @@
+#include "face_restoration_postprocess.cuh"
+
+// Step 1: clamp to [-1, 1], then map to [0, 1]
+__device__ float process_range_single(float x) {
+    x = fmax(-1.0f, fmin(1.0f, x));
+    return (x + 1.f) / 2.f;
+}
+
+// CHW -> HWC index mapping
+__device__ int get_hwc_index(int c, int h, int w, int channel, int width) {
+    return h * (width * channel) + w * channel + c;
+}
+
+// Scale to [0, 255], clamp, and round to uint8
+__device__ unsigned char float_to_uint8_simple(float x) {
+    return (unsigned char)rintf(fminf(255.f, fmaxf(0.f, x * 255.f)));
+}
+
+// Fused post-process kernel: range mapping, CHW->HWC transpose, uint8 conversion
+__global__ void face_restoration_postprocess(
+        float* input_buffer,          // input (TRT output, CHW, float)
+        unsigned char* output_final,  // final output (HWC, uint8)
+        int channel,
+        int height,
+        int width
+) {
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    int total_size = channel * height * width;
+    if (idx >= total_size) return;
+
+    // 1. Range mapping
+    float processed = process_range_single(input_buffer[idx]);
+
+    // 2. Recover (c, h, w) from the flat CHW index
+    int c = idx / (height * width);
+    int h = (idx % (height * width)) / width;
+    int w = idx % width;
+
+    // 3. Compute the HWC destination index
+    int hwc_idx = get_hwc_index(c, h, w, channel, width);
+
+    // 4. Convert to uint8 and write out
+    output_final[hwc_idx] = float_to_uint8_simple(processed);
+}
diff --git a/lite/trt/kernel/face_restoration_postprocess.cuh b/lite/trt/kernel/face_restoration_postprocess.cuh
new file mode 100644
index 00000000..adc276e1
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess.cuh
@@ -0,0 +1,8 @@
+#include "cuda_runtime.h"
+extern "C" __global__ void face_restoration_postprocess(
+        float* input_buffer,          // input (TRT output, CHW, float)
+        unsigned char* output_final,  // final output (HWC, uint8)
+        int channel,
+        int height,
+        int width
+);
\ No newline at end of file
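A quick way to validate the fused kernel is to compare it against a plain CPU reference on random input; a sketch (the function name is illustrative, not part of the toolkit):

    // CPU reference for the fused post-process kernel, for unit tests only.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    std::vector<uint8_t> reference_postprocess(const std::vector<float> &chw,
                                               int c, int h, int w) {
        std::vector<uint8_t> hwc(chw.size());
        for (int ci = 0; ci < c; ++ci)
            for (int hi = 0; hi < h; ++hi)
                for (int wi = 0; wi < w; ++wi) {
                    float x = chw[ci * h * w + hi * w + wi];
                    x = (std::clamp(x, -1.f, 1.f) + 1.f) / 2.f; // [-1,1] -> [0,1]
                    float scaled = std::min(255.f, std::max(0.f, x * 255.f));
                    hwc[hi * w * c + wi * c + ci] = static_cast<uint8_t>(std::lrintf(scaled));
                }
        return hwc;
    }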
diff --git a/lite/trt/kernel/face_restoration_postprocess_manager.cpp b/lite/trt/kernel/face_restoration_postprocess_manager.cpp
new file mode 100644
index 00000000..f1ace60b
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess_manager.cpp
@@ -0,0 +1,45 @@
+//
+// Created by root on 11/29/24.
+//
+
+#include "face_restoration_postprocess_manager.h"
+
+void launch_face_restoration_postprocess(
+        float* trt_outputs,
+        unsigned char* output_final,
+        int channel,
+        int height,
+        int width
+){
+    // Grid/block sizing: a fixed 256 threads per block
+    int block_size = 256;
+    int vec_num = channel * height * width;
+    int grid_size = (vec_num + block_size - 1) / block_size;
+
+    // Allocate device memory for the kernel's output
+    unsigned char* d_output_final;
+    cudaMalloc(&d_output_final, vec_num * sizeof(unsigned char));
+
+    // Launch the kernel
+    face_restoration_postprocess<<<grid_size, block_size>>>(
+            trt_outputs,
+            d_output_final,
+            channel,
+            height,
+            width
+    );
+    cudaDeviceSynchronize();
+    cudaError_t error = cudaGetLastError();
+    if (error != cudaSuccess) {
+        printf("CUDA error: %s\n", cudaGetErrorString(error));
+    }
+
+    // Copy the result back to the host
+    cudaMemcpy(output_final, d_output_final, vec_num * sizeof(unsigned char),
+               cudaMemcpyDeviceToHost);
+
+    // Free device memory
+    cudaFree(d_output_final);
+}
\ No newline at end of file
diff --git a/lite/trt/kernel/face_restoration_postprocess_manager.h b/lite/trt/kernel/face_restoration_postprocess_manager.h
new file mode 100644
index 00000000..57db9e11
--- /dev/null
+++ b/lite/trt/kernel/face_restoration_postprocess_manager.h
@@ -0,0 +1,21 @@
+//
+// Created by root on 11/29/24.
+//
+
+#ifndef LITE_AI_TOOLKIT_FACE_RESTORATION_POSTPROCESS_MANAGER_H
+#define LITE_AI_TOOLKIT_FACE_RESTORATION_POSTPROCESS_MANAGER_H
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <vector>
+#include "face_restoration_postprocess.cuh"
+
+void launch_face_restoration_postprocess(
+        float* trt_outputs,
+        unsigned char* output_final,
+        int channel,
+        int height,
+        int width
+);
+
+#endif //LITE_AI_TOOLKIT_FACE_RESTORATION_POSTPROCESS_MANAGER_H
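Usage is intentionally minimal: the first argument must be a device pointer (typically the engine's output binding), while the destination is ordinary host memory. A hedged sketch, assuming a 1x3x512x512 restoration engine whose output binding is buffers[1]:

    // Sketch: buffers[1] is assumed to be the device-side output binding.
    std::vector<unsigned char> hwc_u8(3 * 512 * 512);
    launch_face_restoration_postprocess(
            static_cast<float*>(buffers[1]), hwc_u8.data(), 3, 512, 512);
    cv::Mat restored(512, 512, CV_8UC3, hwc_u8.data()); // HWC, RGB order
    cv::cvtColor(restored, restored, cv::COLOR_RGB2BGR);

Because the launcher synchronizes and copies internally, no stream handling is needed at the call site.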
+// + +#include "generate_bbox_cuda_manager.h" +// Kernel launch wrapper function +void launch_yolov8_postprocess( + float* trt_outputs, + int number_of_boxes, + float conf_threshold, + float ratio_height, + float ratio_width, + lite::types::BoundingBoxType* output_boxes, + int max_output_boxes +) { + // 计算grid和block尺寸 + int block_size = 256; + int grid_size = (number_of_boxes + block_size - 1) / block_size; + + // 分配设备内存 + lite::types::BoundingBoxType* d_output_boxes; + int* d_output_count; + + cudaMalloc(&d_output_boxes, max_output_boxes * sizeof(lite::types::BoundingBoxType)); + cudaMalloc(&d_output_count, sizeof(int)); + cudaMemset(d_output_count, 0, sizeof(int)); + + // 启动内核 + yolov8_postprocess_kernel<<>>( + trt_outputs, + number_of_boxes, + conf_threshold, + ratio_height, + ratio_width, + d_output_boxes, + d_output_count + ); + + // 同步和错误检查 + cudaDeviceSynchronize(); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + } + + // 复制输出数据 + int h_output_count; + cudaMemcpy(&h_output_count, d_output_count, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(output_boxes, d_output_boxes, h_output_count * sizeof(lite::types::BoundingBoxType), cudaMemcpyDeviceToHost); + + // 释放设备内存 + cudaFree(d_output_boxes); + cudaFree(d_output_count); +} \ No newline at end of file diff --git a/lite/trt/kernel/generate_bbox_cuda_manager.h b/lite/trt/kernel/generate_bbox_cuda_manager.h new file mode 100644 index 00000000..83bb5263 --- /dev/null +++ b/lite/trt/kernel/generate_bbox_cuda_manager.h @@ -0,0 +1,20 @@ +// +// Created by wangzijian on 11/26/24. +// + +#pragma once +#include +#include +#include +#include "lite/types.h" +#include "generate_bbox_kernel.cuh" + +void launch_yolov8_postprocess( + float* trt_outputs, + int number_of_boxes, + float conf_threshold, + float ratio_height, + float ratio_width, + lite::types::BoundingBoxType* output_boxes, + int max_output_boxes +); \ No newline at end of file diff --git a/lite/trt/kernel/generate_bbox_kernel.cu b/lite/trt/kernel/generate_bbox_kernel.cu new file mode 100644 index 00000000..36ccb04f --- /dev/null +++ b/lite/trt/kernel/generate_bbox_kernel.cu @@ -0,0 +1,33 @@ +#include "generate_bbox_kernel.cuh" + +__global__ void yolov8_postprocess_kernel( + float* trt_outputs, + int number_of_boxes, + float conf_threshold, + float ratio_height, + float ratio_width, + lite::types::BoundingBoxType* output_boxes, // 直接使用目标类型 + int* output_count +) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= number_of_boxes) return; + + const float score = trt_outputs[4 * number_of_boxes + index]; + if (score > conf_threshold) { + + float x1 = (trt_outputs[index] - 0.5 * trt_outputs[2 * number_of_boxes + index]) * ratio_width; + float y1 = (trt_outputs[number_of_boxes + index] - 0.5 * trt_outputs[3 * number_of_boxes + index]) * ratio_height; + float x2 = (trt_outputs[index] + 0.5 * trt_outputs[2 * number_of_boxes + index]) * ratio_width; + float y2 = (trt_outputs[number_of_boxes + index] + 0.5 * trt_outputs[3 * number_of_boxes + index]) * ratio_height; + + // 使用原子操作获取输出索引 + int output_index = atomicAdd(output_count, 1); + // 直接设置BoundingBoxType + output_boxes[output_index].x1 = x1; + output_boxes[output_index].y1 = y1; + output_boxes[output_index].x2 = x2; + output_boxes[output_index].y2 = y2; + output_boxes[output_index].score = score; + output_boxes[output_index].flag = true; + } +} diff --git a/lite/trt/kernel/generate_bbox_kernel.cuh b/lite/trt/kernel/generate_bbox_kernel.cuh 
diff --git a/lite/trt/kernel/generate_bbox_kernel.cuh b/lite/trt/kernel/generate_bbox_kernel.cuh
new file mode 100644
index 00000000..c2c74834
--- /dev/null
+++ b/lite/trt/kernel/generate_bbox_kernel.cuh
@@ -0,0 +1,12 @@
+#include "cuda_runtime.h"
+#include "lite/types.h"
+
+extern "C" __global__ void yolov8_postprocess_kernel(
+        float* trt_outputs,
+        int number_of_boxes,
+        float conf_threshold,
+        float ratio_height,
+        float ratio_width,
+        lite::types::BoundingBoxType<float, float>* output_boxes, // written directly in the target type
+        int* output_count
+);
diff --git a/lite/trt/kernel/nms_cuda_manager.cpp b/lite/trt/kernel/nms_cuda_manager.cpp
new file mode 100644
index 00000000..30c5cc90
--- /dev/null
+++ b/lite/trt/kernel/nms_cuda_manager.cpp
@@ -0,0 +1,122 @@
+// nms_cuda_manager.cpp
+#include "nms_cuda_manager.h"
+#include "nms_kernel.cuh"
+#include <stdexcept>
+#include <algorithm>
+
+// Check that a CUDA call succeeded; throw on failure
+#define CUDA_CHECK(call) \
+    do { \
+        cudaError_t err = call; \
+        if (err != cudaSuccess) { \
+            throw std::runtime_error("CUDA error: " + std::string(cudaGetErrorString(err))); \
+        } \
+    } while(0)
+
+NMSCudaManager::NMSCudaManager() {}
+
+NMSCudaManager::~NMSCudaManager() {
+    release_resources();
+}
+
+void NMSCudaManager::init(size_t max_boxes) {
+    // Already initialized and large enough: nothing to do
+    if (is_initialized && max_boxes <= max_boxes_num) {
+        return;
+    }
+
+    // Release any existing buffers first
+    release_resources();
+
+    try {
+        // Device memory
+        CUDA_CHECK(cudaMalloc(&d_boxes, max_boxes * 5 * sizeof(float)));
+        CUDA_CHECK(cudaMalloc(&d_result, max_boxes * sizeof(int)));
+
+        // Host memory
+        h_result = new int[max_boxes];
+
+        max_boxes_num = max_boxes;
+        is_initialized = true;
+    }
+    catch (const std::exception& e) {
+        // Make sure partially acquired resources are released on failure
+        release_resources();
+        throw;
+    }
+}
+
+void NMSCudaManager::release_resources() {
+    // Deliberately does not throw: this also runs from the destructor
+    if (d_boxes) {
+        cudaFree(d_boxes);
+        d_boxes = nullptr;
+    }
+
+    if (d_result) {
+        cudaFree(d_result);
+        d_result = nullptr;
+    }
+
+    if (h_result) {
+        delete[] h_result;
+        h_result = nullptr;
+    }
+
+    max_boxes_num = 0;
+    is_initialized = false;
+}
+
+std::vector<int> NMSCudaManager::perform_nms(
+        const std::vector<lite::types::Boxf>& boxes,
+        const std::vector<float>& confidences,
+        float nms_thresh
+) {
+    // Sanity check
+    if (boxes.size() != confidences.size()) {
+        throw std::invalid_argument("Box and confidence sizes must match");
+    }
+
+    // Initialize, or grow the buffers when this batch is larger than before
+    const int num_boxes = boxes.size();
+    if (!is_initialized || static_cast<size_t>(num_boxes) > max_boxes_num) {
+        init(std::max<size_t>(num_boxes, max_boxes_num * 2));
+    }
+
+    // Pack boxes as (x1, y1, x2, y2, score)
+    std::vector<float> box_data(num_boxes * 5);
+    for (int i = 0; i < num_boxes; ++i) {
+        box_data[i * 5]     = boxes[i].x1;
+        box_data[i * 5 + 1] = boxes[i].y1;
+        box_data[i * 5 + 2] = boxes[i].x2;
+        box_data[i * 5 + 3] = boxes[i].y2;
+        box_data[i * 5 + 4] = confidences[i];
+    }
+
+    // Copy to the GPU
+    CUDA_CHECK(cudaMemcpy(d_boxes, box_data.data(), num_boxes * 5 * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Kernel launch configuration
+    const int block_size = 256;
+    const int grid_size = (num_boxes + block_size - 1) / block_size;
+
+    // Launch the kernel
+    nms_kernel<<<grid_size, block_size>>>(d_boxes, num_boxes, nms_thresh, d_result);
+    CUDA_CHECK(cudaGetLastError());
+
+    // Wait for the kernel to finish
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Copy results back to the CPU
+    CUDA_CHECK(cudaMemcpy(h_result, d_result, num_boxes * sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Collect the kept indices
+    std::vector<int> keep_indices;
+    for (int i = 0; i < num_boxes; ++i) {
+        if (h_result[i] == 1) {
+            keep_indices.push_back(i);
+        }
+    }
+
+    return keep_indices;
+}
\ No newline at end of file
diff --git a/lite/trt/kernel/nms_cuda_manager.h b/lite/trt/kernel/nms_cuda_manager.h
new file mode 100644
index 00000000..cb5fa240
--- /dev/null
+++ b/lite/trt/kernel/nms_cuda_manager.h
@@ -0,0 +1,38 @@
+// nms_cuda_manager.h
+#pragma once
+#include <cuda_runtime.h>
+#include <vector>
+#include <cstddef>
+#include "lite/types.h"
+
+class NMSCudaManager {
+public:
+    NMSCudaManager();
+    ~NMSCudaManager();
+
+    // Non-copyable: the manager owns raw device pointers
+    NMSCudaManager(const NMSCudaManager&) = delete;
+    NMSCudaManager& operator=(const NMSCudaManager&) = delete;
+
+    // Allocate (or grow) the CUDA resources
+    void init(size_t max_boxes = 1024);
+
+    // Run NMS on the GPU and return the indices of the kept boxes
+    std::vector<int> perform_nms(
+            const std::vector<lite::types::Boxf>& boxes,
+            const std::vector<float>& confidences,
+            float nms_thresh
+    );
+
+private:
+    // Release all owned resources
+    void release_resources();
+
+    // CUDA memory pointers
+    float* d_boxes = nullptr;  // device: packed (x1,y1,x2,y2,score) boxes
+    int* d_result = nullptr;   // device: keep/suppress flags
+    int* h_result = nullptr;   // host: keep/suppress flags
+
+    size_t max_boxes_num = 0;    // current capacity
+    bool is_initialized = false; // initialization flag
+};
\ No newline at end of file
diff --git a/lite/trt/kernel/nms_kernel.cu b/lite/trt/kernel/nms_kernel.cu
new file mode 100644
index 00000000..92e47852
--- /dev/null
+++ b/lite/trt/kernel/nms_kernel.cu
@@ -0,0 +1,55 @@
+#include <cuda_runtime.h>
+
+// Layout of one packed box: (x1, y1, x2, y2, score)
+struct bbox {
+    float x1, y1, x2, y2, score;
+};
+
+// Device-side IoU between two packed boxes
+extern "C" __device__ float calculate_iou(float* a, float* b) {
+    float left = max(a[0], b[0]);
+    float right = min(a[2], b[2]);
+    float top = max(a[1], b[1]);
+    float bottom = min(a[3], b[3]);
+
+    float width = max(right - left, 0.f);
+    float height = max(bottom - top, 0.f);
+
+    float interArea = width * height;
+    float boxAArea = (a[2] - a[0]) * (a[3] - a[1]);
+    float boxBArea = (b[2] - b[0]) * (b[3] - b[1]);
+
+    // Guard against division by zero for degenerate (empty) boxes
+    float unionArea = boxAArea + boxBArea - interArea;
+    return unionArea > 0.f ? interArea / unionArea : 0.f;
+}
+
+// NMS kernel: one thread per box, O(n^2) pairwise comparisons
+extern "C" __global__ void nms_kernel(float* bboxes, int number_of_boxes, float threshold_iou, int* result) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Bounds check
+    if (index >= number_of_boxes) return;
+
+    // Assume the current box is kept until a better overlapping box is found
+    result[index] = 1;
+
+    for (int i = 0; i < number_of_boxes; i++) {
+        // Skip self
+        if (i == index) continue;
+
+        // Pointers to the current and the compared box
+        float* current_box = bboxes + index * 5;
+        float* compare_box = bboxes + i * 5;
+
+        float iou = calculate_iou(current_box, compare_box);
+
+        float current_score = current_box[4];
+        float compare_score = compare_box[4];
+
+        // Suppress the current box if it overlaps a higher-scoring box
+        if (iou > threshold_iou && compare_score > current_score) {
+            result[index] = 0;
+            break;
+        }
+    }
+}
diff --git a/lite/trt/kernel/nms_kernel.cuh b/lite/trt/kernel/nms_kernel.cuh
new file mode 100644
index 00000000..6d93735a
--- /dev/null
+++ b/lite/trt/kernel/nms_kernel.cuh
@@ -0,0 +1,4 @@
+#include "cuda_runtime.h"
+extern "C" __global__ void nms_kernel(float* bboxes, int number_of_boxes, float threshold_iou, int* result);
+
+extern "C" __device__ float calculate_iou(float* a, float* b);
\ No newline at end of file
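Finally, the manager can also be used standalone; a minimal sketch with hypothetical inputs (0.45f is an illustrative IoU threshold):

    #include "lite/trt/kernel/nms_cuda_manager.h"

    std::vector<int> run_nms(const std::vector<lite::types::Boxf> &boxes,
                             const std::vector<float> &scores) {
        NMSCudaManager nms; // device buffers are allocated lazily inside perform_nms
        return nms.perform_nms(boxes, scores, 0.45f);
    }

Keeping one long-lived NMSCudaManager and reusing it across frames avoids repeated cudaMalloc/cudaFree, which is the point of the grow-only buffer design.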