diff --git a/linux/CMakeLists.txt b/linux/CMakeLists.txt
new file mode 100644
index 0000000..ec1cc44
--- /dev/null
+++ b/linux/CMakeLists.txt
@@ -0,0 +1,48 @@
+cmake_minimum_required(VERSION 3.10)
+
+set(ProjectName "DragGan-NCNN")
+project(${ProjectName})
+message("ProjectName: ${ProjectName}")
+
+set(CMAKE_BUILD_TYPE Release)
+set(CMAKE_AUTOMOC ON)
+set(CMAKE_AUTORCC ON)
+set(CMAKE_AUTOUIC ON)
+set(ncnn_DIR ${PROJECT_SOURCE_DIR}/lib/cmake/ncnn)
+# add your own path to qt5
+set(CMAKE_PREFIX_PATH "[PATH TO]/qt5/5.14.2/gcc_64/lib/cmake/")
+
+add_definitions("-fPIC")
+
+
+find_package(ncnn REQUIRED)
+find_package(Qt5 COMPONENTS
+    Core
+    Gui
+    Widgets
+    REQUIRED)
+
+find_package(OpenCV REQUIRED)
+
+
+add_executable(${ProjectName}
+    main.cpp
+    mainwindow.cpp
+    mainwindow.h
+    mainwindow.ui
+    mainwindow.qrc
+)
+
+target_include_directories(${ProjectName} PUBLIC
+    ${OpenCV_INCLUDE_DIRS}
+    ${PROJECT_SOURCE_DIR}/include
+)
+
+target_link_libraries(${ProjectName}
+    ${OpenCV_LIBS}
+    ncnn
+    Qt5::Core
+    Qt5::Gui
+    Qt5::Widgets
+)
+
diff --git a/linux/DragGan-NCNN b/linux/DragGan-NCNN
new file mode 100755
index 0000000..cf340e3
Binary files /dev/null and b/linux/DragGan-NCNN differ
diff --git a/linux/assets/PUT *.bin & *.param FILES HERE b/linux/assets/PUT *.bin & *.param FILES HERE
new file mode 100644
index 0000000..e69de29
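The CMakeLists.txt above expects the vendored ncnn under linux/lib/cmake/ncnn with headers under linux/include, and the empty assets placeholder marks where the converted *.param/*.bin model pair goes. As a rough sketch of how an application built this way would pick those models up at runtime (file names here are hypothetical placeholders, not the real DragGAN model names; ncnn::Net comes from net.h in the same vendored include directory):

    // sketch only -- "model.param" / "model.bin" are placeholder names
    #include "net.h"

    int main()
    {
        ncnn::Net net;
        // nonzero return means the load failed
        if (net.load_param("assets/model.param") != 0)
            return -1;
        if (net.load_model("assets/model.bin") != 0)
            return -1;
        return 0;
    }
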
diff --git a/linux/include/ncnn/allocator.h b/linux/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/linux/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// the alignment of all the allocated buffers
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// we have some optimized kernels that may overread buffer a bit in loop
+// it is common to interleave next-loop data load with arithmetic instructions
+// allocating more bytes keeps us safe from SEGV_ACCERR failure
+#define NCNN_MALLOC_OVERREAD 64
+
+// Aligns a pointer to the specified number of bytes
+// ptr Aligned pointer
+// n Alignment size that must be a power of two
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n);
+}
+
+// Aligns a buffer size to the specified number of bytes
+// The function returns the minimum number that is greater or equal to sz and is divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n;
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size)
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0;
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
+    adata[-1] = udata;
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr)
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1];
+        free(udata);
+#endif
+    }
+}
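To make the rounding behaviour above concrete, a small self-contained illustration (alignSize is re-stated locally so this compiles without the ncnn headers):

    #include <cassert>
    #include <cstddef>

    // local re-statement of alignSize, for illustration only
    static size_t align_size(size_t sz, int n)
    {
        return (sz + n - 1) & -n;
    }

    int main()
    {
        assert(align_size(1, 16) == 16);  // rounds up to the next multiple of 16
        assert(align_size(16, 16) == 16); // already aligned, unchanged
        assert(align_size(17, 16) == 32);
        // the generic fastMalloc path works the same way: it over-allocates,
        // aligns the payload pointer, and stores the raw malloc pointer at
        // adata[-1] so fastFree can recover and free it.
        return 0;
    }
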
+
+#if NCNN_THREADS
+// exchange-add operation for atomic operations on reference counters
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0;
+    virtual void fastFree(void* ptr) = 0;
+};
+
+class PoolAllocatorPrivate;
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    PoolAllocator(const PoolAllocator&);
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d;
+};
+
+class UnlockedPoolAllocatorPrivate;
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d;
+};
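PoolAllocator (thread-safe) and UnlockedPoolAllocator (single-threaded, no locking) are normally handed to a net through ncnn::Option, which is declared in option.h and is not part of this diff. A sketch of the usual wiring, assuming the standard ncnn Option fields blob_allocator and workspace_allocator:

    #include "allocator.h"
    #include "net.h" // assumed to be vendored alongside these headers

    // the allocators must outlive every inference that uses them
    static ncnn::UnlockedPoolAllocator g_blob_pool;
    static ncnn::PoolAllocator g_workspace_pool;

    void configure(ncnn::Net& net)
    {
        net.opt.blob_allocator = &g_blob_pool;
        net.opt.workspace_allocator = &g_workspace_pool;
    }
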
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified by command functions internally
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialize and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialize and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0;
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0;
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected:
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate;
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&);
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d;
+};
+
+class VkWeightAllocatorPrivate;
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&);
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d;
+};
+
+class VkStagingAllocatorPrivate;
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // ratio range 0 ~ 1
+    // default cr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&);
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d;
+};
+
+class VkWeightStagingAllocatorPrivate;
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&);
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d;
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init();
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb;
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/linux/include/ncnn/benchmark.h b/linux/include/ncnn/benchmark.h
new file mode 100644
index 0000000..ed42c1a
--- /dev/null
+++ b/linux/include/ncnn/benchmark.h
@@ -0,0 +1,39 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// get now timestamp in ms
+NCNN_EXPORT double get_current_time();
+
+// sleep milliseconds
+NCNN_EXPORT void sleep(unsigned long long int milliseconds = 1000);
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
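Since get_current_time() returns milliseconds, timing any block is a subtraction; a sketch (run() stands in for the code under test):

    #include "benchmark.h"
    #include <cstdio>

    void run(); // stand-in for the work being measured

    void timed_run()
    {
        double t0 = ncnn::get_current_time();
        run();
        double t1 = ncnn::get_current_time();
        fprintf(stderr, "run took %.2f ms\n", t1 - t0);
    }
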
diff --git a/linux/include/ncnn/blob.h b/linux/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/linux/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob
+{
+public:
+    // empty
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name
+    std::string name;
+#endif // NCNN_STRING
+    // layer index which produce this blob as output
+    int producer;
+    // layer index which need this blob as input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/linux/include/ncnn/c_api.h b/linux/include/ncnn/c_api.h
new file mode 100644
index 0000000..31d5b6d
--- /dev/null
+++ b/linux/include/ncnn/c_api.h
@@ -0,0 +1,347 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t;
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis;
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size);
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr);
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB 1
+#define NCNN_MAT_PIXEL_BGR 2
+#define NCNN_MAT_PIXEL_GRAY 3
+#define NCNN_MAT_PIXEL_RGBA 4
+#define NCNN_MAT_PIXEL_BGRA 5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) (X | (Y << 16))
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
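NCNN_MAT_PIXEL_X2Y packs the source format into the low 16 bits and the requested format into the high 16 bits, so a layout conversion can happen while loading. A sketch converting BGR bytes to an RGB float mat:

    #include "c_api.h"

    ncnn_mat_t load_rgb_from_bgr(const unsigned char* bgr, int w, int h)
    {
        /* low 16 bits: layout of the bytes, high 16 bits: layout wanted in the mat */
        int type = NCNN_MAT_PIXEL_X2Y(NCNN_MAT_PIXEL_BGR, NCNN_MAT_PIXEL_RGB);
        /* stride is bytes per row; 3 bytes per BGR pixel, NULL = default allocator */
        return ncnn_mat_from_pixels(bgr, type, w, h, w * 3, NULL);
    }
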
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t;
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis;
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p);
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size);
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis;
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type);
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t;
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis;
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt);
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt);
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt);
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt);
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator;
+    ncnn_layer_destroyer_t destroyer;
+    void* userdata;
+    ncnn_net_custom_layer_factory_t next;
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t;
+struct __ncnn_net_t
+{
+    void* pthis;
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory;
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
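The creator/destroyer pair gives the net a factory for layers it does not know; the creator returns a __ncnn_layer_t whose function pointers have been repointed at user code. A sketch (the layer name and its behaviour are hypothetical):

    #include "c_api.h"

    /* hypothetical custom layer: negate the blob in place */
    static int my_forward_inplace(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt)
    {
        (void)layer;
        (void)opt;
        float* p = (float*)ncnn_mat_get_data(bottom_top_blob);
        /* cstep * c covers every element of a contiguous float mat */
        size_t n = ncnn_mat_get_cstep(bottom_top_blob) * (size_t)ncnn_mat_get_c(bottom_top_blob);
        for (size_t i = 0; i < n; i++)
            p[i] = -p[i];
        return 0;
    }

    static ncnn_layer_t my_layer_creator(void* userdata)
    {
        (void)userdata;
        ncnn_layer_t layer = ncnn_layer_create();
        ncnn_layer_set_one_blob_only(layer, 1);
        ncnn_layer_set_support_inplace(layer, 1);
        layer->forward_inplace_1 = my_forward_inplace;
        return layer;
    }

    static void my_layer_destroyer(ncnn_layer_t layer, void* userdata)
    {
        (void)userdata;
        ncnn_layer_destroy(layer);
    }

    /* registered before loading the param file that references "MyLayer": */
    /* ncnn_net_register_custom_layer_by_type(net, "MyLayer", my_layer_creator, my_layer_destroyer, NULL); */
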
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+NCNN_EXPORT int ncnn_net_get_input_count(const ncnn_net_t net);
+NCNN_EXPORT int ncnn_net_get_output_count(const ncnn_net_t net);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_net_get_input_name(const ncnn_net_t net, int i);
+NCNN_EXPORT const char* ncnn_net_get_output_name(const ncnn_net_t net, int i);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_get_input_index(const ncnn_net_t net, int i);
+NCNN_EXPORT int ncnn_net_get_output_index(const ncnn_net_t net, int i);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+/* mat process api */
+#define NCNN_BORDER_CONSTANT 0
+#define NCNN_BORDER_REPLICATE 1
+#define NCNN_BORDER_REFLECT 2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
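End to end, the C API mirrors the C++ Net/Extractor flow. A minimal sketch (paths and blob names are hypothetical and model-specific):

    #include "c_api.h"
    #include <stdio.h>

    int infer(const ncnn_mat_t in)
    {
        ncnn_net_t net = ncnn_net_create();
        if (ncnn_net_load_param(net, "assets/model.param") != 0
                || ncnn_net_load_model(net, "assets/model.bin") != 0)
        {
            ncnn_net_destroy(net);
            return -1;
        }

        ncnn_extractor_t ex = ncnn_extractor_create(net);
        ncnn_extractor_input(ex, "input", in); /* blob name depends on the model */

        ncnn_mat_t out = NULL;
        ncnn_extractor_extract(ex, "output", &out);
        printf("out dims=%d w=%d h=%d c=%d\n", ncnn_mat_get_dims(out),
               ncnn_mat_get_w(out), ncnn_mat_get_h(out), ncnn_mat_get_c(out));

        ncnn_mat_destroy(out);
        ncnn_extractor_destroy(ex);
        ncnn_net_destroy(net);
        return 0;
    }
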
diff --git a/linux/include/ncnn/command.h b/linux/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/linux/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate;
+class NCNN_EXPORT VkCompute
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait();
+
+    int reset();
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev;
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d;
+};
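VkCompute is a command recorder: every record_* call only queues work, and nothing touches the GPU until submit_and_wait(). A sketch of the usual round trip, assuming a Pipeline that was already created elsewhere and an out_gpu mat already allocated to the right shape:

    #include "command.h"
    #include "gpu.h"

    int dispatch_once(const ncnn::Pipeline* pipeline, const ncnn::Mat& in,
                      ncnn::VkMat& out_gpu, ncnn::Mat& out, const ncnn::Option& opt)
    {
        ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
        ncnn::VkCompute cmd(vkdev);

        ncnn::VkMat in_gpu;
        cmd.record_upload(in, in_gpu, opt);

        std::vector<ncnn::VkMat> bindings(2);
        bindings[0] = in_gpu;
        bindings[1] = out_gpu;
        std::vector<ncnn::vk_constant_type> constants; // no push constants here
        cmd.record_pipeline(pipeline, bindings, constants, out_gpu /* dispatch shape */);

        cmd.record_download(out_gpu, out, opt);

        // everything above was only recorded; this submits and blocks
        return cmd.submit_and_wait();
    }
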
+
+class VkTransferPrivate;
+class NCNN_EXPORT VkTransfer
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait();
+
+protected:
+    const VulkanDevice* vkdev;
+
+private:
+    VkTransferPrivate* const d;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/linux/include/ncnn/cpu.h b/linux/include/ncnn/cpu.h
new file mode 100644
index 0000000..7d6bfce
--- /dev/null
+++ b/linux/include/ncnn/cpu.h
@@ -0,0 +1,178 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public:
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set;
+#endif
+#if __APPLE__
+    unsigned int policy;
+#endif
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// cpuid = aarch64 cpuid info
+NCNN_EXPORT int cpu_support_arm_cpuid();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// cpu l2 varies from 64k to 1M, but l3 can be zero
+NCNN_EXPORT int get_cpu_level2_cache_size();
+NCNN_EXPORT int get_cpu_level3_cache_size();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// return 0 if success for setter function
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// runtime thread affinity info
+NCNN_EXPORT int is_current_thread_running_on_a53_a55();
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON, FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
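These are plain runtime queries; a sketch that inspects the host and then pins work to the big cluster:

    #include "cpu.h"
    #include <cstdio>

    void tune_for_host()
    {
        printf("cpus: %d (big %d / little %d)\n", ncnn::get_cpu_count(),
               ncnn::get_big_cpu_count(), ncnn::get_little_cpu_count());
        printf("avx2: %d\n", ncnn::cpu_support_x86_avx2());

        // 2 = big clusters only; the setter returns 0 on success
        ncnn::set_cpu_powersave(2);

        // 3 = DAZ ON + FTZ ON, avoids slow subnormal float math on x86
        ncnn::set_flush_denormals(3);
    }
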
diff --git a/linux/include/ncnn/datareader.h b/linux/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/linux/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text
+    // return 1 if scan success
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data
+    // return bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get model data reference
+    // return bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate;
+class NCNN_EXPORT DataReaderFromStdio : public DataReader
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp);
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+
+private:
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d;
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate;
+class NCNN_EXPORT DataReaderFromMemory : public DataReader
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem);
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+    virtual size_t reference(size_t size, const void** buf) const;
+
+private:
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d;
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate;
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset);
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+
+private:
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d;
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
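Anything that can produce bytes can feed a model through this interface: subclass DataReader and override read() (and scan() if plain-text param files must be parsed too). A sketch reading from a std::istream:

    #include "datareader.h"
    #include <istream>

    class DataReaderFromIStream : public ncnn::DataReader
    {
    public:
        explicit DataReaderFromIStream(std::istream& is) : m_is(is) {}

        // binary model data; returns the number of bytes actually read
        virtual size_t read(void* buf, size_t size) const
        {
            m_is.read(static_cast<char*>(buf), static_cast<std::streamsize>(size));
            return static_cast<size_t>(m_is.gcount());
        }

    private:
        std::istream& m_is;
    };
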
diff --git a/linux/include/ncnn/gpu.h b/linux/include/ncnn/gpu.h
new file mode 100644
index 0000000..1eff228
--- /dev/null
+++ b/linux/include/ncnn/gpu.h
@@ -0,0 +1,392 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+
+// Create VkInstance and initialize some objects that need to be calculated by GPU
+// Creates a VkInstance object, Checks the extended attributes supported by the Vulkan instance concerned,
+// Initializes, and creates Vulkan validation layers (if ENABLE_VALIDATION_LAYER is enabled),
+// Iterates over all supported physical devices, etc.
+NCNN_EXPORT int create_gpu_instance();
+
+// Get global VkInstance variable
+// Must be called after create_gpu_instance() and before destroy_gpu_instance()
+NCNN_EXPORT VkInstance get_gpu_instance();
+
+// Destroy VkInstance object and free the memory of the associated object
+// Usually called in the destructor of the main program exit
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+extern int support_VK_EXT_validation_features;
+extern int support_VK_EXT_validation_flags;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR;
+
+// VK_KHR_external_memory_capabilities
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate;
+class NCNN_EXPORT GpuInfo
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const;
+
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // bug is not feature
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+    bool support_cooperative_matrix_16_8_16() const;
+    bool support_cooperative_matrix_16_16_16() const;
+
+    // extension capability
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_buffer_device_address() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_cooperative_matrix() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_portability_subset() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_buffer_device_address() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_memory_priority() const;
+    int support_VK_EXT_queue_family_foreign() const;
+    int support_VK_AMD_device_coherent_memory() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private:
+    GpuInfo(const GpuInfo&);
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance();
+    GpuInfoPrivate* const d;
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
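Once create_gpu_instance() has run, device discovery is a loop over get_gpu_count(); a sketch:

    #include "gpu.h"
    #include <cstdio>

    void list_gpus()
    {
        if (ncnn::create_gpu_instance() != 0)
            return;

        for (int i = 0; i < ncnn::get_gpu_count(); i++)
        {
            const ncnn::GpuInfo& info = ncnn::get_gpu_info(i);
            // type: 0 = discrete, 1 = integrated, 2 = virtual, 3 = cpu
            printf("gpu %d: %s (type %d)\n", i, info.device_name(), info.type());
        }

        ncnn::destroy_gpu_instance();
    }
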
VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + + // VK_KHR_bind_memory2 + PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR; + PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR; + + // VK_KHR_buffer_device_address + PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR; + PFN_vkGetBufferOpaqueCaptureAddressKHR vkGetBufferOpaqueCaptureAddressKHR; + PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR vkGetDeviceMemoryOpaqueCaptureAddressKHR; + + // VK_KHR_create_renderpass2 + PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR; + PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR; + PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR; + PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR; + + // VK_KHR_descriptor_update_template + PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR; + PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR; + PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR; + + // VK_KHR_get_memory_requirements2 + PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR; + PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR; + PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR; + + // VK_KHR_maintenance1 + PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR; + + // VK_KHR_maintenance3 + PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR; + + // VK_KHR_push_descriptor + PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR; + PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR; + + // VK_KHR_sampler_ycbcr_conversion + PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR; + PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR; + + // VK_KHR_swapchain + PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR; + PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR; + PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR; + PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR; + PFN_vkQueuePresentKHR vkQueuePresentKHR; + + // VK_EXT_buffer_device_address + PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT; + +#if __ANDROID_API__ >= 26 + // VK_ANDROID_external_memory_android_hardware_buffer + PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID; + PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID; +#endif // __ANDROID_API__ >= 26 + +protected: + // device extension + int init_device_extension(); + +private: + VulkanDevice(const VulkanDevice&); + VulkanDevice& operator=(const VulkanDevice&); + +private: + VulkanDevicePrivate* const d; +}; + +NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index()); + +// online spirv compilation +NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector& spirv); +NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector& spirv); +NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector& spirv); + +// info from spirv +class NCNN_EXPORT ShaderInfo +{ +public: + int specialization_count; + int binding_count; + int push_constant_count; + + // 0 = null + // 1 = storage buffer + // 2 = storage image + // 3 = combined image sampler + int binding_types[16]; // 16 is large enough I think ... 
+ + int reserved_0; + int reserved_1; + int reserved_2; + int reserved_3; +}; + +NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); + +} // namespace ncnn + +#endif // NCNN_VULKAN + +#endif // NCNN_GPU_H diff --git a/linux/include/ncnn/layer.h b/linux/include/ncnn/layer.h new file mode 100644 index 0000000..ae4a843 --- /dev/null +++ b/linux/include/ncnn/layer.h @@ -0,0 +1,224 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef NCNN_LAYER_H +#define NCNN_LAYER_H + +#include "mat.h" +#include "modelbin.h" +#include "option.h" +#include "paramdict.h" +#include "platform.h" + +#include <vector> + +#if NCNN_VULKAN +#include "command.h" +#include "pipeline.h" + +#include <vulkan/vulkan.h> +#endif // NCNN_VULKAN + +namespace ncnn { + +class NCNN_EXPORT Layer +{ +public: + // empty + Layer(); + // virtual destructor + virtual ~Layer(); + + // load layer specific parameter from parsed dict + // return 0 if success + virtual int load_param(const ParamDict& pd); + + // load layer specific weight data from model binary + // return 0 if success + virtual int load_model(const ModelBin& mb); + + // layer implementation specific setup + // return 0 if success + virtual int create_pipeline(const Option& opt); + + // layer implementation specific clean + // return 0 if success + virtual int destroy_pipeline(const Option& opt); + +public: + // one input and one output blob + bool one_blob_only; + + // support inplace inference + bool support_inplace; + + // support vulkan compute + bool support_vulkan; + + // accept input blob with packed storage + bool support_packing; + + // accept bf16 + bool support_bf16_storage; + + // accept fp16 + bool support_fp16_storage; + + // accept int8 + bool support_int8_storage; + + // shader image storage + bool support_image_storage; + + // shader tensor storage + bool support_tensor_storage; + + bool support_reserved_00; + + bool support_reserved_0; + bool support_reserved_1; + bool support_reserved_2; + bool support_reserved_3; + bool support_reserved_4; + bool support_reserved_5; + bool support_reserved_6; + bool support_reserved_7; + bool support_reserved_8; + bool support_reserved_9; + + // feature disabled set + int featmask; + +public: + // implement inference + // return 0 if success + virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + // implement inplace inference + // return 0 if success + virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const; + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +#if NCNN_VULKAN +public: + // upload weight blob from host to device + virtual int upload_model(VkTransfer& cmd, const Option& opt); + +public: + // implement inference + // return 0 if success + virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + + // implement inference + // return 0 if success + virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; + + // implement inplace inference + // return 0 if success + virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + + // implement inplace inference + // return 0 if success + virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + // assigned immediately after creating this layer + const VulkanDevice* vkdev; +#endif // NCNN_VULKAN + +public: + // custom user data + void* userdata; + // layer type index + int typeindex; +#if NCNN_STRING + // layer type name + std::string type; + // layer name + std::string name; +#endif // NCNN_STRING + // blob index which this layer needs as input + std::vector<int> bottoms; + // blob index which this layer produces as output + std::vector<int> tops; + // shape hint + std::vector<Mat> bottom_shapes; + std::vector<Mat> top_shapes; +}; + +// layer factory function +typedef Layer* (*layer_creator_func)(void*); +typedef void (*layer_destroyer_func)(Layer*, void*); + +struct layer_registry_entry +{ +#if NCNN_STRING + // layer type name + const char* name; +#endif // NCNN_STRING + // layer factory entry + layer_creator_func creator; +}; + +struct custom_layer_registry_entry +{ +#if NCNN_STRING + // layer type name + const char* name; +#endif // NCNN_STRING + // layer factory entry + layer_creator_func creator; + layer_destroyer_func destroyer; + void* userdata; +}; + +struct overwrite_builtin_layer_registry_entry +{ + // layer type index + int typeindex; + // layer factory entry + layer_creator_func creator; + layer_destroyer_func destroyer; + void* userdata; +}; + +#if NCNN_STRING +// get layer type from type name +NCNN_EXPORT int layer_to_index(const char* type); +// create layer from type name +NCNN_EXPORT Layer* create_layer(const char* type); +#endif // NCNN_STRING +// create layer from layer type +NCNN_EXPORT Layer* create_layer(int index); + +#define DEFINE_LAYER_CREATOR(name) \ + ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \ + { \ + return new name; \ + } + +#define DEFINE_LAYER_DESTROYER(name) \ + void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \ + { \ + delete layer; \ + } + +} // namespace ncnn + +#endif // NCNN_LAYER_H diff --git a/linux/include/ncnn/layer_shader_type.h b/linux/include/ncnn/layer_shader_type.h new file mode 100644 index 0000000..c143e7d --- /dev/null +++ b/linux/include/ncnn/layer_shader_type.h @@ -0,0 +1,29 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef NCNN_LAYER_SHADER_TYPE_H +#define NCNN_LAYER_SHADER_TYPE_H + +namespace ncnn { + +namespace LayerShaderType { +enum LayerShaderType +{ +#include "layer_shader_type_enum.h" +}; +} // namespace LayerShaderType + +} // namespace ncnn + +#endif // NCNN_LAYER_SHADER_TYPE_H diff --git a/linux/include/ncnn/layer_shader_type_enum.h b/linux/include/ncnn/layer_shader_type_enum.h new file mode 100644 index 0000000..75782d0 --- /dev/null +++ b/linux/include/ncnn/layer_shader_type_enum.h @@ -0,0 +1,398 @@ +// Layer Shader Enum header +// +// This file is auto-generated by cmake, don't edit it. + +absval = 0, +absval_pack4 = 1, +absval_pack8 = 2, +batchnorm = 3, +batchnorm_pack4 = 4, +batchnorm_pack8 = 5, +concat = 6, +concat_pack4 = 7, +concat_pack4to1 = 8, +concat_pack8 = 9, +concat_pack8to1 = 10, +concat_pack8to4 = 11, +convolution = 12, +convolution_1x1s1d1 = 13, +convolution_3x3s1d1_winograd23_transform_input = 14, +convolution_3x3s1d1_winograd23_transform_output = 15, +convolution_3x3s1d1_winograd43_transform_input = 16, +convolution_3x3s1d1_winograd43_transform_output = 17, +convolution_3x3s1d1_winograd_gemm = 18, +convolution_gemm = 19, +convolution_pack1to4 = 20, +convolution_pack1to4_1x1s1d1 = 21, +convolution_pack1to4_3x3s1d1_winograd_gemm = 22, +convolution_pack1to4_gemm = 23, +convolution_pack1to8 = 24, +convolution_pack1to8_1x1s1d1 = 25, +convolution_pack1to8_3x3s1d1_winograd_gemm = 26, +convolution_pack1to8_gemm = 27, +convolution_pack4 = 28, +convolution_pack4_1x1s1d1 = 29, +convolution_pack4_1x1s1d1_khr_cm_16_16_16 = 30, +convolution_pack4_1x1s1d1_khr_cm_16_8_8 = 31, +convolution_pack4_1x1s1d1_nv_cm_16_16_16 = 32, +convolution_pack4_1x1s1d1_nv_cm_16_8_8 = 33, +convolution_pack4_3x3s1d1_winograd23_transform_input = 34, +convolution_pack4_3x3s1d1_winograd23_transform_output = 35, +convolution_pack4_3x3s1d1_winograd43_transform_input = 36, +convolution_pack4_3x3s1d1_winograd43_transform_output = 37, +convolution_pack4_3x3s1d1_winograd_gemm = 38, +convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16 = 39, +convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8 = 40, +convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16 = 41, +convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8 = 42, +convolution_pack4_gemm = 43, +convolution_pack4_gemm_khr_cm_16_16_16 = 44, +convolution_pack4_gemm_khr_cm_16_8_8 = 45, +convolution_pack4_gemm_nv_cm_16_16_16 = 46, +convolution_pack4_gemm_nv_cm_16_8_8 = 47, +convolution_pack4to1 = 48, +convolution_pack4to1_1x1s1d1 = 49, +convolution_pack4to1_3x3s1d1_winograd_gemm = 50, +convolution_pack4to1_gemm = 51, +convolution_pack4to8 = 52, +convolution_pack4to8_1x1s1d1 = 53, +convolution_pack4to8_3x3s1d1_winograd_gemm = 54, +convolution_pack4to8_gemm = 55, +convolution_pack8 = 56, +convolution_pack8_1x1s1d1 = 57, +convolution_pack8_3x3s1d1_winograd23_transform_input = 58, +convolution_pack8_3x3s1d1_winograd23_transform_output = 59, +convolution_pack8_3x3s1d1_winograd43_transform_input = 60, +convolution_pack8_3x3s1d1_winograd43_transform_output = 61, +convolution_pack8_3x3s1d1_winograd_gemm = 62, +convolution_pack8_gemm = 63, 
+convolution_pack8to1 = 64, +convolution_pack8to1_1x1s1d1 = 65, +convolution_pack8to1_3x3s1d1_winograd_gemm = 66, +convolution_pack8to1_gemm = 67, +convolution_pack8to4 = 68, +convolution_pack8to4_1x1s1d1 = 69, +convolution_pack8to4_3x3s1d1_winograd_gemm = 70, +convolution_pack8to4_gemm = 71, +crop = 72, +crop_pack1to4 = 73, +crop_pack1to8 = 74, +crop_pack4 = 75, +crop_pack4to1 = 76, +crop_pack4to8 = 77, +crop_pack8 = 78, +crop_pack8to1 = 79, +crop_pack8to4 = 80, +deconvolution = 81, +deconvolution_col2im = 82, +deconvolution_gemm = 83, +deconvolution_pack1to4 = 84, +deconvolution_pack1to4_gemm = 85, +deconvolution_pack1to8 = 86, +deconvolution_pack1to8_gemm = 87, +deconvolution_pack4 = 88, +deconvolution_pack4_col2im = 89, +deconvolution_pack4_gemm = 90, +deconvolution_pack4_gemm_khr_cm_16_16_16 = 91, +deconvolution_pack4_gemm_khr_cm_16_8_8 = 92, +deconvolution_pack4_gemm_nv_cm_16_16_16 = 93, +deconvolution_pack4_gemm_nv_cm_16_8_8 = 94, +deconvolution_pack4to1 = 95, +deconvolution_pack4to1_gemm = 96, +deconvolution_pack4to8 = 97, +deconvolution_pack4to8_gemm = 98, +deconvolution_pack8 = 99, +deconvolution_pack8_col2im = 100, +deconvolution_pack8_gemm = 101, +deconvolution_pack8to1 = 102, +deconvolution_pack8to1_gemm = 103, +deconvolution_pack8to4 = 104, +deconvolution_pack8to4_gemm = 105, +dropout = 106, +dropout_pack4 = 107, +dropout_pack8 = 108, +eltwise = 109, +eltwise_pack4 = 110, +eltwise_pack8 = 111, +elu = 112, +elu_pack4 = 113, +elu_pack8 = 114, +flatten = 115, +flatten_pack1to4 = 116, +flatten_pack1to8 = 117, +flatten_pack4 = 118, +flatten_pack4to8 = 119, +flatten_pack8 = 120, +innerproduct = 121, +innerproduct_gemm = 122, +innerproduct_gemm_wp1to4 = 123, +innerproduct_gemm_wp1to8 = 124, +innerproduct_gemm_wp4 = 125, +innerproduct_gemm_wp4to1 = 126, +innerproduct_gemm_wp4to8 = 127, +innerproduct_gemm_wp8 = 128, +innerproduct_gemm_wp8to1 = 129, +innerproduct_gemm_wp8to4 = 130, +innerproduct_pack1to4 = 131, +innerproduct_pack1to8 = 132, +innerproduct_pack4 = 133, +innerproduct_pack4to1 = 134, +innerproduct_pack4to8 = 135, +innerproduct_pack8 = 136, +innerproduct_pack8to1 = 137, +innerproduct_pack8to4 = 138, +innerproduct_reduce_sum8 = 139, +innerproduct_reduce_sum8_pack4 = 140, +innerproduct_reduce_sum8_pack8 = 141, +innerproduct_sum8 = 142, +innerproduct_sum8_pack1to4 = 143, +innerproduct_sum8_pack1to8 = 144, +innerproduct_sum8_pack4 = 145, +innerproduct_sum8_pack4to1 = 146, +innerproduct_sum8_pack4to8 = 147, +innerproduct_sum8_pack8 = 148, +innerproduct_sum8_pack8to1 = 149, +innerproduct_sum8_pack8to4 = 150, +lrn_norm = 151, +lrn_norm_across_channel_pack4 = 152, +lrn_norm_across_channel_pack8 = 153, +lrn_norm_within_channel_pack4 = 154, +lrn_norm_within_channel_pack8 = 155, +lrn_square_pad = 156, +lrn_square_pad_across_channel_pack4 = 157, +lrn_square_pad_across_channel_pack8 = 158, +lrn_square_pad_within_channel_pack4 = 159, +lrn_square_pad_within_channel_pack8 = 160, +pooling = 161, +pooling_adaptive = 162, +pooling_adaptive_pack4 = 163, +pooling_adaptive_pack8 = 164, +pooling_global = 165, +pooling_global_pack4 = 166, +pooling_global_pack8 = 167, +pooling_pack4 = 168, +pooling_pack8 = 169, +prelu = 170, +prelu_pack4 = 171, +prelu_pack8 = 172, +relu = 173, +relu_pack4 = 174, +relu_pack8 = 175, +reshape = 176, +reshape_pack1to4 = 177, +reshape_pack1to8 = 178, +reshape_pack4 = 179, +reshape_pack4to1 = 180, +reshape_pack4to8 = 181, +reshape_pack8 = 182, +reshape_pack8to1 = 183, +reshape_pack8to4 = 184, +scale = 185, +scale_pack4 = 186, +scale_pack8 = 187, +sigmoid = 188, 
+sigmoid_pack4 = 189, +sigmoid_pack8 = 190, +slice = 191, +slice_pack1to4 = 192, +slice_pack1to8 = 193, +slice_pack4 = 194, +slice_pack4to8 = 195, +slice_pack8 = 196, +softmax_div_sum = 197, +softmax_div_sum_pack4 = 198, +softmax_div_sum_pack8 = 199, +softmax_exp_sub_max = 200, +softmax_exp_sub_max_pack4 = 201, +softmax_exp_sub_max_pack8 = 202, +softmax_reduce_max = 203, +softmax_reduce_max_pack4 = 204, +softmax_reduce_max_pack8 = 205, +softmax_reduce_sum = 206, +softmax_reduce_sum_pack4 = 207, +softmax_reduce_sum_pack8 = 208, +tanh = 209, +tanh_pack4 = 210, +tanh_pack8 = 211, +binaryop = 212, +binaryop_broadcast = 213, +binaryop_broadcast_pack1to4 = 214, +binaryop_broadcast_pack1to8 = 215, +binaryop_broadcast_pack4 = 216, +binaryop_broadcast_pack8 = 217, +binaryop_pack4 = 218, +binaryop_pack8 = 219, +unaryop = 220, +unaryop_pack4 = 221, +unaryop_pack8 = 222, +convolutiondepthwise = 223, +convolutiondepthwise_group = 224, +convolutiondepthwise_group_pack1to4 = 225, +convolutiondepthwise_group_pack1to8 = 226, +convolutiondepthwise_group_pack4 = 227, +convolutiondepthwise_group_pack4to1 = 228, +convolutiondepthwise_group_pack4to8 = 229, +convolutiondepthwise_group_pack8 = 230, +convolutiondepthwise_group_pack8to1 = 231, +convolutiondepthwise_group_pack8to4 = 232, +convolutiondepthwise_pack4 = 233, +convolutiondepthwise_pack8 = 234, +padding = 235, +padding_3d = 236, +padding_3d_pack4 = 237, +padding_3d_pack8 = 238, +padding_pack1to4 = 239, +padding_pack1to8 = 240, +padding_pack4 = 241, +padding_pack4to1 = 242, +padding_pack4to8 = 243, +padding_pack8 = 244, +padding_pack8to1 = 245, +padding_pack8to4 = 246, +normalize_coeffs = 247, +normalize_coeffs_pack4 = 248, +normalize_coeffs_pack8 = 249, +normalize_norm = 250, +normalize_norm_pack4 = 251, +normalize_norm_pack8 = 252, +normalize_reduce_sum4_fp16_to_fp32 = 253, +normalize_reduce_sum4_fp16_to_fp32_pack4 = 254, +normalize_reduce_sum4_fp16_to_fp32_pack8 = 255, +normalize_reduce_sum4_fp32 = 256, +normalize_reduce_sum4_fp32_pack4 = 257, +normalize_reduce_sum4_fp32_pack8 = 258, +permute = 259, +permute_pack1to4 = 260, +permute_pack1to8 = 261, +permute_pack4 = 262, +permute_pack4to1 = 263, +permute_pack4to8 = 264, +permute_pack8 = 265, +permute_pack8to1 = 266, +permute_pack8to4 = 267, +priorbox = 268, +priorbox_mxnet = 269, +interp = 270, +interp_bicubic = 271, +interp_bicubic_coeffs = 272, +interp_bicubic_pack4 = 273, +interp_bicubic_pack8 = 274, +interp_pack4 = 275, +interp_pack8 = 276, +deconvolutiondepthwise = 277, +deconvolutiondepthwise_group = 278, +deconvolutiondepthwise_group_pack1to4 = 279, +deconvolutiondepthwise_group_pack1to8 = 280, +deconvolutiondepthwise_group_pack4 = 281, +deconvolutiondepthwise_group_pack4to1 = 282, +deconvolutiondepthwise_group_pack4to8 = 283, +deconvolutiondepthwise_group_pack8 = 284, +deconvolutiondepthwise_group_pack8to1 = 285, +deconvolutiondepthwise_group_pack8to4 = 286, +deconvolutiondepthwise_pack4 = 287, +deconvolutiondepthwise_pack8 = 288, +shufflechannel = 289, +shufflechannel_pack4 = 290, +shufflechannel_pack8 = 291, +instancenorm_coeffs = 292, +instancenorm_coeffs_pack4 = 293, +instancenorm_coeffs_pack8 = 294, +instancenorm_norm = 295, +instancenorm_norm_pack4 = 296, +instancenorm_norm_pack8 = 297, +instancenorm_reduce_mean = 298, +instancenorm_reduce_mean_pack4 = 299, +instancenorm_reduce_mean_pack8 = 300, +instancenorm_reduce_sum4_fp16_to_fp32 = 301, +instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 302, +instancenorm_reduce_sum4_fp16_to_fp32_pack8 = 303, +instancenorm_reduce_sum4_fp32 = 304, 
+instancenorm_reduce_sum4_fp32_pack4 = 305, +instancenorm_reduce_sum4_fp32_pack8 = 306, +instancenorm_sub_mean_square = 307, +instancenorm_sub_mean_square_pack4 = 308, +instancenorm_sub_mean_square_pack8 = 309, +clip = 310, +clip_pack4 = 311, +clip_pack8 = 312, +reorg = 313, +reorg_pack1to4 = 314, +reorg_pack1to8 = 315, +reorg_pack4 = 316, +reorg_pack4to8 = 317, +reorg_pack8 = 318, +packing = 319, +packing_fp16_to_fp32 = 320, +packing_fp32_to_fp16 = 321, +packing_pack1to4 = 322, +packing_pack1to4_fp16_to_fp32 = 323, +packing_pack1to4_fp32_to_fp16 = 324, +packing_pack1to8 = 325, +packing_pack1to8_fp16_to_fp32 = 326, +packing_pack1to8_fp32_to_fp16 = 327, +packing_pack4 = 328, +packing_pack4_fp16_to_fp32 = 329, +packing_pack4_fp32_to_fp16 = 330, +packing_pack4to1 = 331, +packing_pack4to1_fp16_to_fp32 = 332, +packing_pack4to1_fp32_to_fp16 = 333, +packing_pack4to8 = 334, +packing_pack4to8_fp16_to_fp32 = 335, +packing_pack4to8_fp32_to_fp16 = 336, +packing_pack8 = 337, +packing_pack8_fp16_to_fp32 = 338, +packing_pack8_fp32_to_fp16 = 339, +packing_pack8to1 = 340, +packing_pack8to1_fp16_to_fp32 = 341, +packing_pack8to1_fp32_to_fp16 = 342, +packing_pack8to4 = 343, +packing_pack8to4_fp16_to_fp32 = 344, +packing_pack8to4_fp32_to_fp16 = 345, +cast_fp16_to_fp32 = 346, +cast_fp16_to_fp32_pack4 = 347, +cast_fp16_to_fp32_pack8 = 348, +cast_fp32_to_fp16 = 349, +cast_fp32_to_fp16_pack4 = 350, +cast_fp32_to_fp16_pack8 = 351, +hardsigmoid = 352, +hardsigmoid_pack4 = 353, +hardsigmoid_pack8 = 354, +hardswish = 355, +hardswish_pack4 = 356, +hardswish_pack8 = 357, +pixelshuffle = 358, +pixelshuffle_pack4 = 359, +pixelshuffle_pack4to1 = 360, +pixelshuffle_pack8 = 361, +pixelshuffle_pack8to1 = 362, +pixelshuffle_pack8to4 = 363, +deepcopy = 364, +deepcopy_pack4 = 365, +deepcopy_pack8 = 366, +mish = 367, +mish_pack4 = 368, +mish_pack8 = 369, +swish = 370, +swish_pack4 = 371, +swish_pack8 = 372, +gemm = 373, +multiheadattention_qk_cross = 374, +multiheadattention_qk_cross_pack1to4 = 375, +multiheadattention_qk_cross_pack4 = 376, +multiheadattention_qk_cross_pack4to1 = 377, +multiheadattention_qkv_cross = 378, +multiheadattention_qkv_cross_pack1to4 = 379, +multiheadattention_qkv_cross_pack4 = 380, +multiheadattention_qkv_cross_pack4to1 = 381, +gelu = 382, +gelu_pack4 = 383, +gelu_pack8 = 384, +erf = 385, +erf_pack4 = 386, +erf_pack8 = 387, +celu = 388, +celu_pack4 = 389, +celu_pack8 = 390, +convert_ycbcr = 391, +vulkan_activation = 392, + diff --git a/linux/include/ncnn/layer_type.h b/linux/include/ncnn/layer_type.h new file mode 100644 index 0000000..511c714 --- /dev/null +++ b/linux/include/ncnn/layer_type.h @@ -0,0 +1,30 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
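The LayerShaderType values generated above are what the shader_type_index argument of compile_spirv_module() in gpu.h refers to, so any built-in shader can be compiled to SPIR-V at runtime. A minimal sketch, assuming a Vulkan-enabled ncnn build and eliding error handling:

#include "gpu.h"
#include "layer_shader_type.h"
#include "option.h"
#include <vector>

void compile_relu_shader()
{
    std::vector<uint32_t> spirv;
    ncnn::Option opt;
    // relu = 173 in the generated enum above
    int ret = ncnn::compile_spirv_module(ncnn::LayerShaderType::relu, opt, spirv);
    // ret == 0 on success; the SPIR-V words can then be handed to
    // VulkanDevice::compile_shader_module(spirv.data(), spirv.size() * 4),
    // where the size is presumably the byte count of the word buffer
    (void)ret;
}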
+ +#ifndef NCNN_LAYER_TYPE_H +#define NCNN_LAYER_TYPE_H + +namespace ncnn { + +namespace LayerType { +enum LayerType +{ +#include "layer_type_enum.h" + CustomBit = (1 << 8), +}; +} // namespace LayerType + +} // namespace ncnn + +#endif // NCNN_LAYER_TYPE_H diff --git a/linux/include/ncnn/layer_type_enum.h b/linux/include/ncnn/layer_type_enum.h new file mode 100644 index 0000000..fd539bc --- /dev/null +++ b/linux/include/ncnn/layer_type_enum.h @@ -0,0 +1,108 @@ +// Layer Type Enum header +// +// This file is auto-generated by cmake, don't edit it. + +AbsVal = 0, +ArgMax = 1, +BatchNorm = 2, +Bias = 3, +BNLL = 4, +Concat = 5, +Convolution = 6, +Crop = 7, +Deconvolution = 8, +Dropout = 9, +Eltwise = 10, +ELU = 11, +Embed = 12, +Exp = 13, +Flatten = 14, +InnerProduct = 15, +Input = 16, +Log = 17, +LRN = 18, +MemoryData = 19, +MVN = 20, +Pooling = 21, +Power = 22, +PReLU = 23, +Proposal = 24, +Reduction = 25, +ReLU = 26, +Reshape = 27, +ROIPooling = 28, +Scale = 29, +Sigmoid = 30, +Slice = 31, +Softmax = 32, +Split = 33, +SPP = 34, +TanH = 35, +Threshold = 36, +Tile = 37, +RNN = 38, +LSTM = 39, +BinaryOp = 40, +UnaryOp = 41, +ConvolutionDepthWise = 42, +Padding = 43, +Squeeze = 44, +ExpandDims = 45, +Normalize = 46, +Permute = 47, +PriorBox = 48, +DetectionOutput = 49, +Interp = 50, +DeconvolutionDepthWise = 51, +ShuffleChannel = 52, +InstanceNorm = 53, +Clip = 54, +Reorg = 55, +YoloDetectionOutput = 56, +Quantize = 57, +Dequantize = 58, +Yolov3DetectionOutput = 59, +PSROIPooling = 60, +ROIAlign = 61, +Packing = 62, +Requantize = 63, +Cast = 64, +HardSigmoid = 65, +SELU = 66, +HardSwish = 67, +Noop = 68, +PixelShuffle = 69, +DeepCopy = 70, +Mish = 71, +StatisticsPooling = 72, +Swish = 73, +Gemm = 74, +GroupNorm = 75, +LayerNorm = 76, +Softplus = 77, +GRU = 78, +MultiHeadAttention = 79, +GELU = 80, +Convolution1D = 81, +Pooling1D = 82, +ConvolutionDepthWise1D = 83, +Convolution3D = 84, +ConvolutionDepthWise3D = 85, +Pooling3D = 86, +MatMul = 87, +Deconvolution1D = 88, +DeconvolutionDepthWise1D = 89, +Deconvolution3D = 90, +DeconvolutionDepthWise3D = 91, +Einsum = 92, +DeformableConv2D = 93, +GLU = 94, +Fold = 95, +Unfold = 96, +GridSample = 97, +CumulativeSum = 98, +CopyTo = 99, +Erf = 100, +Diag = 101, +CELU = 102, + diff --git a/linux/include/ncnn/mat.h b/linux/include/ncnn/mat.h new file mode 100644 index 0000000..c6f59ef --- /dev/null +++ b/linux/include/ncnn/mat.h @@ -0,0 +1,1843 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
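Taken together, layer.h provides the Layer interface plus the DEFINE_LAYER_CREATOR factory macro, and the enum above assigns the built-in type indices that create_layer(int) resolves. A minimal custom-layer sketch follows; the register_custom_layer() call is ncnn::Net API from net.h, assumed here rather than shown in this section:

#include "layer.h"
#include "net.h"

// a do-nothing inplace layer, purely illustrative
class PassthroughLayer : public ncnn::Layer
{
public:
    PassthroughLayer()
    {
        one_blob_only = true;   // one input blob, one output blob
        support_inplace = true; // forward_inplace() will be called
    }
    virtual int forward_inplace(ncnn::Mat& bottom_top_blob, const ncnn::Option& opt) const
    {
        (void)bottom_top_blob;
        (void)opt;
        return 0; // leave the blob untouched; a real layer would transform it here
    }
};

DEFINE_LAYER_CREATOR(PassthroughLayer) // expands to PassthroughLayer_layer_creator(void*)

// typical registration on a net instance (Net is declared in net.h):
// ncnn::Net net;
// net.register_custom_layer("PassthroughLayer", PassthroughLayer_layer_creator);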
+ +#ifndef NCNN_MAT_H +#define NCNN_MAT_H + +#include <stdlib.h> +#include <string.h> +#if __ARM_NEON +#include <arm_neon.h> +#endif +#if __SSE2__ +#include <emmintrin.h> +#if __AVX__ +#include <immintrin.h> +#endif +#endif +#if __mips_msa +#include <msa.h> +#endif +#if __loongarch_sx +#include <lsxintrin.h> +#endif +#if __riscv_vector +#include <riscv_vector.h> +#include "cpu.h" // cpu_riscv_vlenb() +#endif + +#include "allocator.h" +#include "option.h" +#include "platform.h" + +#if NCNN_VULKAN +#include <vulkan/vulkan.h> +#endif // NCNN_VULKAN + +#if NCNN_PIXEL +#if NCNN_PLATFORM_API +#if __ANDROID_API__ >= 9 +#include <android/bitmap.h> +#include <jni.h> +#endif // __ANDROID_API__ >= 9 +#endif // NCNN_PLATFORM_API +#endif // NCNN_PIXEL + +namespace ncnn { + +#if NCNN_VULKAN +class VkMat; +class VkImageMat; +#endif // NCNN_VULKAN + +// the three dimension matrix +class NCNN_EXPORT Mat +{ +public: + // empty + Mat(); + // vec + Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0); + // image + Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); + // dim + Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // cube + Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // packed vec + Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0); + // packed image + Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0); + // packed dim + Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // packed cube + Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // copy + Mat(const Mat& m); + // external vec + Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external image + Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external dim + Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external cube + Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external packed vec + Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // external packed image + Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // external packed dim + Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // external packed cube + Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // release + ~Mat(); + // assign + Mat& operator=(const Mat& m); + // set all + void fill(float v); + void fill(int v); +#if __ARM_NEON + void fill(float32x4_t _v); + void fill(uint16x4_t _v); + void fill(int32x4_t _v); + void fill(int32x4_t _v0, int32x4_t _v1); +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + void fill(float16x4_t _v); + void fill(float16x8_t _v); +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // __ARM_NEON +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + void fill(__m512 _v); +#endif // __AVX512F__ + void fill(__m256 _v, int i = 0); +#endif // __AVX__ + void fill(__m128 _v); + void fill(__m128i _v); +#endif // __SSE2__ +#if __mips_msa + void fill(v4f32 _v); +#endif // __mips_msa +#if __loongarch_sx + void fill(__m128 _v); +#endif //__loongarch_sx +#if __riscv_vector + void fill(vfloat32m1_t _v); + void fill(vuint16m1_t _v); + void fill(vint8m1_t _v); +#if __riscv_zfh + void fill(vfloat16m1_t _v); +#endif // __riscv_zfh +#endif // __riscv_vector + template<typename T> + void fill(T v); + // deep copy + Mat clone(Allocator* allocator = 0) const; + // deep copy from other mat, inplace + void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0); + // reshape vec + Mat reshape(int w, Allocator* allocator = 0) const; + // reshape image + Mat reshape(int w, int h, Allocator* allocator = 0) const; + // reshape dim + Mat reshape(int w, int h, int c, Allocator* allocator = 0) const; + // reshape cube + Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const; + // allocate vec + void create(int w, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate image + void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate dim + void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate cube + void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate packed cube + void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate like + void create_like(const Mat& m, Allocator* allocator = 0); +#if NCNN_VULKAN + // allocate like + void create_like(const VkMat& m, Allocator* allocator = 0); + // allocate like + void create_like(const VkImageMat& im, Allocator* allocator = 0); +#endif // NCNN_VULKAN + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // bits per element + int elembits() const; + + // shape only + Mat shape() const; + + // data reference + Mat channel(int c); + const Mat channel(int c) const; + Mat depth(int z); + const Mat depth(int z) const; + float* row(int y); + const float* row(int y) const; + template<typename T> + T* row(int y); + template<typename T> + const T* row(int y) const; + + // range reference + Mat channel_range(int c, int channels); + const Mat channel_range(int c, int channels) const; + Mat depth_range(int z, int depths); + const Mat depth_range(int z, int depths) const; + Mat row_range(int y, int rows); + const Mat row_range(int y, int rows) const; + Mat range(int x, int n); + const Mat range(int x, int n) const; + + // access raw data + template<typename T> + operator T*(); + template<typename T> + operator const T*() const; + + // convenient access float vec element + float& operator[](size_t i); + const float& operator[](size_t i) const; + +#if NCNN_PIXEL + enum PixelType + { + PIXEL_CONVERT_SHIFT = 16, + PIXEL_FORMAT_MASK = 0x0000ffff, + PIXEL_CONVERT_MASK = 0xffff0000, + + PIXEL_RGB = 1, + PIXEL_BGR = 2, + PIXEL_GRAY = 3, + PIXEL_RGBA = 4, + PIXEL_BGRA = 5, + + PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT), + PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT), + + PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT), + PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT), + + PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT), + PIXEL_GRAY2BGRA = 
PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT), + + PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT), + + PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT), + }; + // convenient construct from pixel data + static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0); + // convenient construct from pixel data with stride(bytes-per-row) parameter + static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0); + // convenient construct from pixel data and resize to specific size + static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0); + // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter + static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0); + // convenient construct from pixel data roi + static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0); + // convenient construct from pixel data roi with stride(bytes-per-row) parameter + static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0); + // convenient construct from pixel data roi and resize to specific size + static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0); + // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter + static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0); + + // convenient export to pixel data + void to_pixels(unsigned char* pixels, int type) const; + // convenient export to pixel data with stride(bytes-per-row) parameter + void to_pixels(unsigned char* pixels, int type, int stride) const; + // convenient export to pixel data and resize to specific size + void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const; + // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter + void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const; + +#if NCNN_PLATFORM_API +#if __ANDROID_API__ >= 9 + // convenient construct from android Bitmap + static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0); + // convenient construct from android Bitmap and resize to specific size + static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0); + // convenient construct from android Bitmap 
roi + static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0); + // convenient construct from android Bitmap roi and resize to specific size + static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0); + // convenient export to android Bitmap and resize to the android Bitmap size + void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const; +#endif // __ANDROID_API__ >= 9 +#endif // NCNN_PLATFORM_API +#endif // NCNN_PIXEL + + // substract channel-wise mean values, then multiply by normalize values, pass 0 to skip + void substract_mean_normalize(const float* mean_vals, const float* norm_vals); + + // convenient construct from half precision floating point data + static Mat from_float16(const unsigned short* data, int size); + + // pointer to the data + void* data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int* refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-d-h-w-1 c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-d-h-w-4 c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-d-h-w-8 c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + Allocator* allocator; + + // the dimension rank + int dims; + + int w; + int h; + int d; + int c; + + size_t cstep; +}; + +#if NCNN_VULKAN + +// the three dimension matrix, vulkan version +class NCNN_EXPORT VkMat +{ +public: + // empty + VkMat(); + // vec + VkMat(int w, size_t elemsize, VkAllocator* allocator); + // image + VkMat(int w, int h, size_t elemsize, VkAllocator* allocator); + // dim + VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // cube + VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator); + // packed vec + VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // packed image + VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // packed dim + VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // packed cube + VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // copy + VkMat(const VkMat& m); + // external vec + VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external image + VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external dim + VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external cube + VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external packed vec + VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed image + VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed dim + VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed cube + VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // release + ~VkMat(); + // assign + VkMat& operator=(const VkMat& m); + // allocate vec + void create(int w, size_t elemsize, VkAllocator* allocator); 
+ // allocate image + void create(int w, int h, size_t elemsize, VkAllocator* allocator); + // allocate dim + void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // allocate cube + void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed cube + void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate like + void create_like(const Mat& m, VkAllocator* allocator); + // allocate like + void create_like(const VkMat& m, VkAllocator* allocator); + // allocate like + void create_like(const VkImageMat& im, VkAllocator* allocator); + + // mapped + Mat mapped() const; + void* mapped_ptr() const; + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // bits per element + int elembits() const; + + // shape only + Mat shape() const; + + // low-level reference + VkBuffer buffer() const; + size_t buffer_offset() const; + size_t buffer_capacity() const; + + // device buffer + VkBufferMemory* data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int* refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-d-h-w-1 c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-d-h-w-4 c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-d-h-w-8 c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + VkAllocator* allocator; + + // the dimension rank + int dims; + + int w; + int h; + int d; + int c; + + size_t cstep; +}; + +class NCNN_EXPORT VkImageMat +{ +public: + // empty + VkImageMat(); + // vec + VkImageMat(int w, size_t elemsize, VkAllocator* allocator); + // image + VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator); + // dim + VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // cube + VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator); + // packed vec + VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // packed image + VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // packed dim + VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // packed cube + VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // copy + VkImageMat(const VkImageMat& m); + // external vec + VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external image + VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external dim + VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external cube + VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external packed vec + VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed image + VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* 
allocator); + // external packed dim + VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed cube + VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // release + ~VkImageMat(); + // assign + VkImageMat& operator=(const VkImageMat& m); + // allocate vec + void create(int w, size_t elemsize, VkAllocator* allocator); + // allocate image + void create(int w, int h, size_t elemsize, VkAllocator* allocator); + // allocate dim + void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // allocate cube + void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed cube + void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate like + void create_like(const Mat& m, VkAllocator* allocator); + // allocate like + void create_like(const VkMat& m, VkAllocator* allocator); + // allocate like + void create_like(const VkImageMat& im, VkAllocator* allocator); + + // mapped + Mat mapped() const; + void* mapped_ptr() const; + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // bits per element + int elembits() const; + + // shape only + Mat shape() const; + + // low-level reference + VkImage image() const; + VkImageView imageview() const; + +#if NCNN_PLATFORM_API +#if __ANDROID_API__ >= 26 + // convenient construct from android hardware buffer + static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator); +#endif // __ANDROID_API__ >= 26 +#endif // NCNN_PLATFORM_API + + // device image + VkImageMemory* data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int* refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-d-h-w-1 c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-d-h-w-4 c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-d-h-w-8 c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + VkAllocator* allocator; + + // the dimension rank + int dims; + + int w; + int h; + int d; + int c; +}; + +// type for vulkan specialization constant and push constant +union vk_specialization_type +{ + int i; + float f; + uint32_t u32; +}; +union vk_constant_type +{ + int i; + float f; +}; +#endif // NCNN_VULKAN + +// misc function +#if NCNN_PIXEL +// convert yuv420sp(nv21) to rgb, the fast approximate version +NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb); +// convert yuv420sp(nv12) to rgb, the fast approximate version +NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb); +// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version +NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb); +// image pixel bilinear resize +NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, 
unsigned char* dst, int w, int h); +NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h); +NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h); +NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h); +// image pixel bilinear resize with stride(bytes-per-row) parameter +NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride); +NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride); +NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride); +NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride); +// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12) +NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h); +#endif // NCNN_PIXEL +#if NCNN_PIXEL_ROTATE +// type is the from type, 6 means rotating from 6 to 1 +// +// 1 2 3 4 5 6 7 8 +// +// 888888 888888 88 88 8888888888 88 88 8888888888 +// 88 88 88 88 88 88 88 88 88 88 88 88 +// 8888 8888 8888 8888 88 8888888888 8888888888 88 +// 88 88 88 88 +// 88 88 888888 888888 +// +// ref http://sylvana.net/jpegcrop/exif_orientation.html +// image pixel kanna rotate +NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); +NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); +NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); +NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); +// image pixel kanna rotate with stride(bytes-per-row) parameter +NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); +NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); +NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); +NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); +// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12) +NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); +#endif // NCNN_PIXEL_ROTATE +#if NCNN_PIXEL_AFFINE +// resolve affine transform matrix from rotation angle, scale factor and x y offset +NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm); +// resolve affine transform matrix from two set of points, num_point must be >= 2 +NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm); +// resolve the inversion affine transform matrix +NCNN_EXPORT void invert_affine_transform(const 
float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to brain half
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // 16 : 16
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    tmp.f = value;
+    return tmp.u >> 16;
+}
+// convert brain half to float
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // 16 : 16
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    tmp.u = value << 16;
+    return tmp.f;
+}
+
+// mat process
+enum BorderType
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    BORDER_TRANSPARENT = -233,
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+NCNN_FORCEINLINE Mat::Mat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(const Mat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref();
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::~Mat()
+{
+    release();
+}
+
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+
+    int i = 0;
+#if __ARM_NEON
+    float32x4_t _c = vdupq_n_f32(_v);
+    for (; i + 3 < size; i += 4)
+    {
+        vst1q_f32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++)
+    {
+        *ptr++ = _v;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    int size = (int)total();
+    int* ptr = (int*)data;
+
+    int i = 0;
+#if __ARM_NEON
+    int32x4_t _c = vdupq_n_s32(_v);
+    for (; i + 3 < size; i += 4)
+    {
+        vst1q_s32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++)
+    {
+        *ptr++ = _v;
+    }
+}
+
+#if __ARM_NEON
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v)
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_f32(ptr, _v);
+        ptr += 4;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1_u16(ptr, _v);
+        ptr += 4;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    int size = (int)total();
+    int* ptr = (int*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v);
+        ptr += 4;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    int size = (int)total();
+    int* ptr = (int*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v0);
+        vst1q_s32(ptr + 4, _v1);
+        ptr += 8;
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v)
+{
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1_f16(ptr, _v);
+        ptr += 4;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_f16(ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m512 _v)
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm512_storeu_ps(ptr, _v);
+        ptr += 16;
+    }
+}
+#endif // __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i)
+{
+    // old gcc cannot overload __m128 and __m256 type
+    // add a dummy int parameter for different mangled function symbol
+    (void)_i;
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm256_storeu_ps(ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __AVX__
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm_storeu_ps(ptr, _v);
+        ptr += 4;
+    }
+}
+NCNN_FORCEINLINE void Mat::fill(__m128i _v)
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm_store_si128((__m128i*)ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v)
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        __msa_st_w((v4i32)_v, ptr, 0);
+        ptr += 4;
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        __lsx_vst(_v, ptr, 0);
+        ptr += 4;
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 4;
+    const size_t vl = vsetvl_e32m1(packn);
+
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse32_v_f32m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse16_v_u16m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(packn);
+
+    int size = (int)total();
+    signed char* ptr = (signed char*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse8_v_i8m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+#if __riscv_zfh
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse16_v_f16m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    int size = (int)total();
+    T* ptr = (T*)data;
+    for (int i = 0; i < size; i++)
+    {
+        ptr[i] = _v;
+    }
+}
+
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m)
+{
+    if (this == &m)
+        return *this;
+
+    if (m.refcount)
+        NCNN_XADD(m.refcount, 1);
+
+    release();
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE void Mat::addref()
+{
+    if (refcount)
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void Mat::release()
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1)
+    {
+        if (allocator)
+            allocator->fastFree(data);
+        else
+            fastFree(data);
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0;
+}
+
+NCNN_FORCEINLINE bool Mat::empty() const
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    return cstep * c;
+}
+
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    return elempack ? static_cast<int>(elemsize * 8) / elempack : 0;
+}
+
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0);
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    Mat m(w, h, d, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims - 1;
+    if (dims == 4)
+        m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    Mat m(w, h, d, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims - 1;
+    if (dims == 4)
+        m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth(int z)
+{
+    return Mat(w, h, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    return Mat(w, h, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE float* Mat::row(int y)
+{
+    return (float*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    return (const float*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    return (T*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    return (const T*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    Mat m(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims;
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    Mat m(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    Mat m(w, h, depths, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+    m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    Mat m(w, h, depths, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+    m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows)
+{
+    return Mat(w, rows, (unsigned char*)data + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    return Mat(w, rows, (unsigned char*)data + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE Mat Mat::range(int x, int n)
+{
+    return Mat(n, (unsigned char*)data + x * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    return Mat(n, (unsigned char*)data + x * elemsize, elemsize, elempack, allocator);
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*()
+{
+    return (T*)data;
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const
+{
+    return (const T*)data;
+}
+
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    return ((float*)data)[i];
+}
+
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    return ((const float*)data)[i];
+}
+
+#if NCNN_VULKAN
+
+NCNN_FORCEINLINE VkMat::VkMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref();
+
+    cstep = m.cstep;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = w * h;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize(w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize(w * h * d * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = w * h;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize(w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize(w * h * d * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m)
+{
+    if (this == &m)
+        return *this;
+
+    if (m.refcount)
+        NCNN_XADD(m.refcount, 1);
+
+    release();
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const
+{
+    if (!allocator->mappable)
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
+{
+    if (!allocator->mappable)
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->offset;
+}
+
+NCNN_FORCEINLINE void VkMat::addref()
+{
+    if (refcount)
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release()
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1)
+    {
+        if (allocator && data)
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0;
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    return cstep * c;
+}
+
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0;
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0);
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const
+{
+    return data->buffer;
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const
+{
+    return data->offset;
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const
+{
+    return data->capacity;
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref();
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    if (this == &m)
+        return *this;
+
+    if (m.refcount)
+        NCNN_XADD(m.refcount, 1);
+
+    release();
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const
+{
+    if (!allocator->mappable || !data->mapped_ptr)
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const
+{
+    if (!allocator->mappable || !data->mapped_ptr)
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset;
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (refcount)
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1)
+    {
+        if (allocator && data)
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    refcount = 0;
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const
+{
+    return w * h * d * c;
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0);
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const
+{
+    return data->image;
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const
+{
+    return data->imageview;
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
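As a quick orientation before the next header, here is a minimal sketch of how the pixel-drawing helpers and Mat declared in mat.h above are typically combined. The canvas size, coordinates, and text are illustrative, and the include path assumes this repo's linux/include layout; colors follow the "RGBA is little-endian encoded" convention from the comments above.

#include <vector>
#include "ncnn/mat.h" // Mat, draw_* helpers from the header above

int main()
{
    const int w = 256;
    const int h = 256;
    std::vector<unsigned char> pixels(w * h * 3, 0); // black RGB canvas

    // little-endian RGBA packing: lowest byte is R, so this is pure green
    const unsigned int green = 0x0000ff00;

    ncnn::draw_rectangle_c3(pixels.data(), w, h, 32, 32, 128, 64, green, 2); // 2px outline
    ncnn::draw_text_c3(pixels.data(), w, h, "hello ncnn", 40, 110, 16, green);

    // wrap the same bytes as a float Mat when feeding a network
    // (from_pixels is declared in the earlier part of mat.h)
    ncnn::Mat in = ncnn::Mat::from_pixels(pixels.data(), ncnn::Mat::PIXEL_RGB, w, h);
    return in.empty() ? 1 : 0;
}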
diff --git a/linux/include/ncnn/modelbin.h b/linux/include/ncnn/modelbin.h
new file mode 100644
index 0000000..aada5f6
--- /dev/null
+++ b/linux/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // element type
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load vec
+    virtual Mat load(int w, int type) const;
+    // load image
+    virtual Mat load(int w, int h, int type) const;
+    // load dim
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load cube
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr);
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const;
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&);
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&);
+
+private:
+    ModelBinFromDataReaderPrivate* const d;
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin
+{
+public:
+    // construct from weight blob array
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const;
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&);
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&);
+
+private:
+    ModelBinFromMatArrayPrivate* const d;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
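ModelBin is normally driven by Net internally while parsing model files, but ModelBinFromMatArray is handy when exercising a single layer with in-memory weights. A hedged sketch only; the weight shape and fill value are illustrative, and the consuming layer is omitted:

#include "ncnn/mat.h"
#include "ncnn/modelbin.h"

void make_weights()
{
    // one 3x3 float kernel, flattened to 9 elements
    ncnn::Mat weights[1];
    weights[0].create(9);
    weights[0].fill(0.1f);

    // load() hands the mats out in array order; type 0 means auto
    ncnn::ModelBinFromMatArray mb(weights);
    ncnn::Mat w0 = mb.load(9, 0);
}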
diff --git a/linux/include/ncnn/ncnn_export.h b/linux/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/linux/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/linux/include/ncnn/net.h b/linux/include/ncnn/net.h
new file mode 100644
index 0000000..98e3ec3
--- /dev/null
+++ b/linux/include/ncnn/net.h
@@ -0,0 +1,274 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net
+{
+public:
+    // empty init
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle, no owner transfer
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr);
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so external memory should be retained when used
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenient load network structure from android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenient load network structure from android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenient load network weight data from android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+    virtual Layer* create_overwrite_builtin_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+    virtual Layer* create_overwrite_builtin_layer(int typeindex);
+
+private:
+    Net(const Net&);
+    Net& operator=(const Net&);
+
+private:
+    NetPrivate* const d;
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const;
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
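net.h above is the primary entry point of the library, so a short sketch of the canonical load-and-run flow may help. The file and blob names ("model.param", "data", "output") are placeholders that depend on the actual model files used with this project:

#include "ncnn/net.h"

int run_inference(const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Net net;
    net.opt.num_threads = 4; // Option must be configured before loading

    // both loaders return 0 on success
    if (net.load_param("model.param"))
        return -1;
    if (net.load_model("model.bin"))
        return -1;

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);             // blob name from the .param file
    return ex.extract("output", out); // runs the graph up to that blob
}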
diff --git a/linux/include/ncnn/option.h b/linux/include/ncnn/option.h
new file mode 100644
index 0000000..7d0cc60
--- /dev/null
+++ b/linux/include/ncnn/option.h
@@ -0,0 +1,156 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option
+{
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8;
+
+    // subgroup option
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0;
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON, FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator;
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    // this option is turned on for A53/A55 automatically
+    // but you can force this on/off if you wish
+    bool use_a53_a55_optimized_kernel;
+
+    bool use_reserved_7;
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
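The Option fields above are plain flags consulted at load and inference time. A brief sketch of typical adjustments, all applied before load_param/load_model as the field comments require; which knobs help depends on the target hardware:

#include "ncnn/net.h"

void configure(ncnn::Net& net)
{
    net.opt.num_threads = 2;
    net.opt.use_vulkan_compute = true;        // needs a Vulkan-capable GPU
    net.opt.use_fp16_storage = true;          // trade precision for bandwidth
    net.opt.use_winograd_convolution = false; // less memory, slower 3x3 conv
}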
diff --git a/linux/include/ncnn/paramdict.h b/linux/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/linux/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict
+{
+public:
+    // empty
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get type
+    int type(int id) const;
+
+    // get int
+    int get(int id, int def) const;
+    // get float
+    float get(int id, float def) const;
+    // get array
+    Mat get(int id, const Mat& def) const;
+
+    // set int
+    void set(int id, int i);
+    // set float
+    void set(int id, float f);
+    // set array
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net;
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/linux/include/ncnn/pipeline.h b/linux/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/linux/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public:
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4);
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations);
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations);
+
+public:
+    VkShaderModule shader_module() const;
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected:
+    void set_shader_module(VkShaderModule shader_module);
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev;
+
+private:
+    Pipeline(const Pipeline&);
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d;
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt);
+    void destroy();
+
+    friend class VkCompute;
+
+protected:
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to;
+    int rotate_from;
+    bool need_resize;
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
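Pipeline objects are normally created inside the GPU layer implementations, so the following is only a rough sketch of the call order in a Vulkan-enabled build; get_gpu_device() comes from gpu.h, and shader_type_index plus the specialization values are placeholders:

#include <vector>
#include "ncnn/gpu.h"
#include "ncnn/pipeline.h"

void build(int shader_type_index, const ncnn::Option& opt)
{
    const ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(0);

    ncnn::Pipeline pipeline(vkdev);
    pipeline.set_optimal_local_size_xyz(8, 8, 1);

    // one integer specialization constant, value 0 (illustrative)
    std::vector<ncnn::vk_specialization_type> specializations(1);
    specializations[0].i = 0;

    pipeline.create(shader_type_index, opt, specializations);
}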
diff --git a/linux/include/ncnn/pipelinecache.h b/linux/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/linux/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev;
+
+private:
+    PipelineCache(const PipelineCache&);
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d;
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/linux/include/ncnn/platform.h b/linux/include/ncnn/platform.h
new file mode 100644
index 0000000..4a3ce3d
--- /dev/null
+++ b/linux/include/ncnn/platform.h
@@ -0,0 +1,284 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
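platform.h below also ships small portable wrappers around Win32/pthread primitives and the NCNN_LOGE logging macro; a brief sketch of the Mutex/MutexLockGuard idiom (the counter is illustrative):

#include "ncnn/platform.h"

static ncnn::Mutex g_lock;
static int g_counter = 0;

void bump()
{
    ncnn::MutexLockGuard guard(g_lock); // RAII: unlocks when leaving scope
    g_counter++;
    NCNN_LOGE("counter is now %d", g_counter); // printf-style, appends a newline
}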
+ +#ifndef NCNN_PLATFORM_H +#define NCNN_PLATFORM_H + +#define NCNN_STDIO 1 +#define NCNN_STRING 1 +#define NCNN_SIMPLEOCV 0 +#define NCNN_SIMPLEOMP 0 +#define NCNN_SIMPLESTL 0 +#define NCNN_THREADS 1 +#define NCNN_BENCHMARK 0 +#define NCNN_C_API 1 +#define NCNN_PLATFORM_API 1 +#define NCNN_PIXEL 1 +#define NCNN_PIXEL_ROTATE 1 +#define NCNN_PIXEL_AFFINE 1 +#define NCNN_PIXEL_DRAWING 1 +#define NCNN_VULKAN 1 +#define NCNN_SYSTEM_GLSLANG 0 +#define NCNN_RUNTIME_CPU 1 +#define NCNN_GNU_INLINE_ASM 1 +#define NCNN_AVX 1 +#define NCNN_XOP 1 +#define NCNN_FMA 1 +#define NCNN_F16C 1 +#define NCNN_AVX2 1 +#define NCNN_AVXVNNI 0 +#define NCNN_AVX512 1 +#define NCNN_AVX512VNNI 1 +#define NCNN_AVX512BF16 0 +#define NCNN_AVX512FP16 0 +#define NCNN_VFPV4 0 +#define NCNN_ARM82 0 +#define NCNN_ARM82DOT 0 +#define NCNN_ARM82FP16FML 0 +#define NCNN_ARM84BF16 0 +#define NCNN_ARM84I8MM 0 +#define NCNN_ARM86SVE 0 +#define NCNN_ARM86SVE2 0 +#define NCNN_ARM86SVEBF16 0 +#define NCNN_ARM86SVEI8MM 0 +#define NCNN_ARM86SVEF32MM 0 +#define NCNN_MSA 0 +#define NCNN_LSX 0 +#define NCNN_MMI 0 +#define NCNN_RVV 0 +#define NCNN_INT8 1 +#define NCNN_BF16 1 +#define NCNN_FORCE_INLINE 1 + +#define NCNN_VERSION_STRING "1.0.20230920" + +#include "ncnn_export.h" + +#ifdef __cplusplus + +#if NCNN_THREADS +#if (defined _WIN32 && !(defined __MINGW32__)) +#define WIN32_LEAN_AND_MEAN +#include +#include +#else +#include +#endif +#endif // NCNN_THREADS + +#if __ANDROID_API__ >= 26 +#define VK_USE_PLATFORM_ANDROID_KHR +#endif // __ANDROID_API__ >= 26 + +namespace ncnn { + +#if NCNN_THREADS +#if (defined _WIN32 && !(defined __MINGW32__)) +class NCNN_EXPORT Mutex +{ +public: + Mutex() { InitializeSRWLock(&srwlock); } + ~Mutex() {} + void lock() { AcquireSRWLockExclusive(&srwlock); } + void unlock() { ReleaseSRWLockExclusive(&srwlock); } +private: + friend class ConditionVariable; + // NOTE SRWLock is available from windows vista + SRWLOCK srwlock; +}; + +class NCNN_EXPORT ConditionVariable +{ +public: + ConditionVariable() { InitializeConditionVariable(&condvar); } + ~ConditionVariable() {} + void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } + void broadcast() { WakeAllConditionVariable(&condvar); } + void signal() { WakeConditionVariable(&condvar); } +private: + CONDITION_VARIABLE condvar; +}; + +static unsigned __stdcall start_wrapper(void* args); +class NCNN_EXPORT Thread +{ +public: + Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } + ~Thread() {} + void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); } +private: + friend unsigned __stdcall start_wrapper(void* args) + { + Thread* t = (Thread*)args; + t->_start(t->_args); + return 0; + } + HANDLE handle; + void* (*_start)(void*); + void* _args; +}; + +class NCNN_EXPORT ThreadLocalStorage +{ +public: + ThreadLocalStorage() { key = TlsAlloc(); } + ~ThreadLocalStorage() { TlsFree(key); } + void set(void* value) { TlsSetValue(key, (LPVOID)value); } + void* get() { return (void*)TlsGetValue(key); } +private: + DWORD key; +}; +#else // (defined _WIN32 && !(defined __MINGW32__)) +class NCNN_EXPORT Mutex +{ +public: + Mutex() { pthread_mutex_init(&mutex, 0); } + ~Mutex() { pthread_mutex_destroy(&mutex); } + void lock() { pthread_mutex_lock(&mutex); } + void unlock() { pthread_mutex_unlock(&mutex); } +private: + friend class ConditionVariable; + pthread_mutex_t mutex; +}; + +class NCNN_EXPORT ConditionVariable +{ +public: + 
ConditionVariable() { pthread_cond_init(&cond, 0); } + ~ConditionVariable() { pthread_cond_destroy(&cond); } + void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } + void broadcast() { pthread_cond_broadcast(&cond); } + void signal() { pthread_cond_signal(&cond); } +private: + pthread_cond_t cond; +}; + +class NCNN_EXPORT Thread +{ +public: + Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } + ~Thread() {} + void join() { pthread_join(t, 0); } +private: + pthread_t t; +}; + +class NCNN_EXPORT ThreadLocalStorage +{ +public: + ThreadLocalStorage() { pthread_key_create(&key, 0); } + ~ThreadLocalStorage() { pthread_key_delete(key); } + void set(void* value) { pthread_setspecific(key, value); } + void* get() { return pthread_getspecific(key); } +private: + pthread_key_t key; +}; +#endif // (defined _WIN32 && !(defined __MINGW32__)) +#else // NCNN_THREADS +class NCNN_EXPORT Mutex +{ +public: + Mutex() {} + ~Mutex() {} + void lock() {} + void unlock() {} +}; + +class NCNN_EXPORT ConditionVariable +{ +public: + ConditionVariable() {} + ~ConditionVariable() {} + void wait(Mutex& /*mutex*/) {} + void broadcast() {} + void signal() {} +}; + +class NCNN_EXPORT Thread +{ +public: + Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} + ~Thread() {} + void join() {} +}; + +class NCNN_EXPORT ThreadLocalStorage +{ +public: + ThreadLocalStorage() { data = 0; } + ~ThreadLocalStorage() {} + void set(void* value) { data = value; } + void* get() { return data; } +private: + void* data; +}; +#endif // NCNN_THREADS + +class NCNN_EXPORT MutexLockGuard +{ +public: + MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } + ~MutexLockGuard() { mutex.unlock(); } +private: + Mutex& mutex; +}; + +} // namespace ncnn + +#if NCNN_SIMPLESTL +#include "simplestl.h" +#else +#include +#include +#include +#include +#endif + +#endif // __cplusplus + +#if NCNN_STDIO +#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8 +#include +#define NCNN_LOGE(...) do { \ + fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \ + __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0) +#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8 +#include +#define NCNN_LOGE(...) do { \ + fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0) +#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8 +#else +#define NCNN_LOGE(...) +#endif + + +#if NCNN_FORCE_INLINE +#ifdef _MSC_VER + #define NCNN_FORCEINLINE __forceinline +#elif defined(__GNUC__) + #define NCNN_FORCEINLINE inline __attribute__((__always_inline__)) +#elif defined(__CLANG__) + #if __has_attribute(__always_inline__) + #define NCNN_FORCEINLINE inline __attribute__((__always_inline__)) + #else + #define NCNN_FORCEINLINE inline + #endif +#else + #define NCNN_FORCEINLINE inline +#endif +#else + #define NCNN_FORCEINLINE inline +#endif + +#endif // NCNN_PLATFORM_H diff --git a/linux/include/ncnn/simpleocv.h b/linux/include/ncnn/simpleocv.h new file mode 100644 index 0000000..54b22d9 --- /dev/null +++ b/linux/include/ncnn/simpleocv.h @@ -0,0 +1,503 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef NCNN_SIMPLEOCV_H +#define NCNN_SIMPLEOCV_H + +#include "platform.h" + +#if NCNN_SIMPLEOCV + +#include +#include +#include "allocator.h" +#include "mat.h" + +#if defined(_MSC_VER) || defined(__GNUC__) +#pragma push_macro("min") +#pragma push_macro("max") +#undef min +#undef max +#endif + +#ifndef NCNN_XADD +using ncnn::NCNN_XADD; +#endif + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; + +enum +{ + CV_LOAD_IMAGE_UNCHANGED = -1, + CV_LOAD_IMAGE_GRAYSCALE = 0, + CV_LOAD_IMAGE_COLOR = 1, +}; + +enum +{ + CV_IMWRITE_JPEG_QUALITY = 1 +}; + +// minimal opencv style data structure implementation +namespace cv { + +template +static inline _Tp saturate_cast(int v) +{ + return _Tp(v); +} +template<> +inline uchar saturate_cast(int v) +{ + return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); +} + +template +struct Scalar_ +{ + Scalar_() + { + v[0] = 0; + v[1] = 0; + v[2] = 0; + v[3] = 0; + } + Scalar_(_Tp _v0) + { + v[0] = _v0; + v[1] = 0; + v[2] = 0; + v[3] = 0; + } + Scalar_(_Tp _v0, _Tp _v1, _Tp _v2) + { + v[0] = _v0; + v[1] = _v1; + v[2] = _v2; + v[3] = 0; + } + Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3) + { + v[0] = _v0; + v[1] = _v1; + v[2] = _v2; + v[3] = _v3; + } + + const _Tp operator[](const int i) const + { + return v[i]; + } + + _Tp operator[](const int i) + { + return v[i]; + } + + _Tp v[4]; +}; + +typedef Scalar_ Scalar; + +template +struct Point_ +{ + Point_() + : x(0), y(0) + { + } + Point_(_Tp _x, _Tp _y) + : x(_x), y(_y) + { + } + + template + operator Point_<_Tp2>() const + { + return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y)); + } + + _Tp x; + _Tp y; +}; + +typedef Point_ Point; +typedef Point_ Point2f; + +template +struct Size_ +{ + Size_() + : width(0), height(0) + { + } + Size_(_Tp _w, _Tp _h) + : width(_w), height(_h) + { + } + + template + operator Size_<_Tp2>() const + { + return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height)); + } + + _Tp width; + _Tp height; +}; + +typedef Size_ Size; +typedef Size_ Size2f; + +template +struct Rect_ +{ + Rect_() + : x(0), y(0), width(0), height(0) + { + } + Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h) + : x(_x), y(_y), width(_w), height(_h) + { + } + Rect_(Point_<_Tp> _p, Size_<_Tp> _size) + : x(_p.x), y(_p.y), width(_size.width), height(_size.height) + { + } + + template + operator Rect_<_Tp2>() const + { + return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height)); + } + + _Tp x; + _Tp y; + _Tp width; + _Tp height; + + // area + _Tp area() const + { + return width * height; + } +}; + +template +static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b) +{ + _Tp x1 = std::max(a.x, b.x), y1 = std::max(a.y, b.y); + a.width = std::min(a.x + a.width, b.x + b.width) - x1; + a.height = std::min(a.y + a.height, b.y + b.height) - y1; + a.x = x1; + a.y = y1; + if (a.width <= 0 || a.height <= 0) + a = Rect_<_Tp>(); + return a; +} + +template +static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b) +{ + _Tp x1 = std::min(a.x, b.x), y1 = 
std::min(a.y, b.y); + a.width = std::max(a.x + a.width, b.x + b.width) - x1; + a.height = std::max(a.y + a.height, b.y + b.height) - y1; + a.x = x1; + a.y = y1; + return a; +} + +template +static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b) +{ + Rect_<_Tp> c = a; + return c &= b; +} + +template +static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b) +{ + Rect_<_Tp> c = a; + return c |= b; +} + +typedef Rect_ Rect; +typedef Rect_ Rect2f; + +#define CV_8UC1 1 +#define CV_8UC3 3 +#define CV_8UC4 4 +#define CV_32FC1 4 + +struct NCNN_EXPORT Mat +{ + Mat() + : data(0), refcount(0), rows(0), cols(0), c(0) + { + } + + Mat(int _rows, int _cols, int flags) + : data(0), refcount(0) + { + create(_rows, _cols, flags); + } + + // copy + Mat(const Mat& m) + : data(m.data), refcount(m.refcount) + { + if (refcount) + NCNN_XADD(refcount, 1); + + rows = m.rows; + cols = m.cols; + c = m.c; + } + + Mat(int _rows, int _cols, int flags, void* _data) + : data((unsigned char*)_data), refcount(0) + { + rows = _rows; + cols = _cols; + c = flags; + } + + ~Mat() + { + release(); + } + + // assign + Mat& operator=(const Mat& m) + { + if (this == &m) + return *this; + + if (m.refcount) + NCNN_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + + rows = m.rows; + cols = m.cols; + c = m.c; + + return *this; + } + + Mat& operator=(const Scalar& s) + { + if (total() > 0) + { + uchar* p = data; + for (int i = 0; i < cols * rows; i++) + { + for (int j = 0; j < c; j++) + { + *p++ = s[j]; + } + } + } + + return *this; + } + + void create(int _rows, int _cols, int flags) + { + release(); + + rows = _rows; + cols = _cols; + c = flags; + + if (total() > 0) + { + // refcount address must be aligned, so we expand totalsize here + size_t totalsize = (total() + 3) >> 2 << 2; + data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int*)(((uchar*)data) + totalsize); + *refcount = 1; + } + } + + void release() + { + if (refcount && NCNN_XADD(refcount, -1) == 1) + ncnn::fastFree(data); + + data = 0; + + rows = 0; + cols = 0; + c = 0; + + refcount = 0; + } + + Mat clone() const + { + if (empty()) + return Mat(); + + Mat m(rows, cols, c); + + if (total() > 0) + { + memcpy(m.data, data, total()); + } + + return m; + } + + bool empty() const + { + return data == 0 || total() == 0; + } + + int channels() const + { + return c; + } + + int type() const + { + return c; + } + + size_t total() const + { + return cols * rows * c; + } + + const uchar* ptr(int y) const + { + return data + y * cols * c; + } + + uchar* ptr(int y) + { + return data + y * cols * c; + } + + template + const _Tp* ptr(int y) const + { + return (const _Tp*)data + y * cols * c; + } + + template + _Tp* ptr(int y) + { + return (_Tp*)data + y * cols * c; + } + + // roi + Mat operator()(const Rect& roi) const + { + if (empty()) + return Mat(); + + Mat m(roi.height, roi.width, c); + + int sy = roi.y; + for (int y = 0; y < roi.height; y++) + { + const uchar* sptr = ptr(sy) + roi.x * c; + uchar* dptr = m.ptr(y); + memcpy(dptr, sptr, roi.width * c); + sy++; + } + + return m; + } + + uchar* data; + + // pointer to the reference counter; + // when points to user-allocated data, the pointer is NULL + int* refcount; + + int rows; + int cols; + + int c; +}; + +enum ImreadModes +{ + IMREAD_UNCHANGED = -1, + IMREAD_GRAYSCALE = 0, + IMREAD_COLOR = 1 +}; + +NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR); + +NCNN_EXPORT Mat imdecode(const std::vector& buf, int 
flags = IMREAD_COLOR); + +enum ImwriteFlags +{ + IMWRITE_JPEG_QUALITY = 1 +}; + +NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector& params = std::vector()); + +NCNN_EXPORT void imshow(const std::string& name, const Mat& m); + +NCNN_EXPORT int waitKey(int delay = 0); + +#if NCNN_PIXEL +NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0); +#endif // NCNN_PIXEL + +#if NCNN_PIXEL_DRAWING + +enum +{ + FILLED = -1 +}; + +NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1); + +NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1); + +NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1); + +NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1); + +enum +{ + FONT_HERSHEY_SIMPLEX = 0 +}; + +NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1); + +NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine); + +#endif // NCNN_PIXEL_DRAWING + +} // namespace cv + +#if defined(_MSC_VER) || defined(__GNUC__) +#pragma pop_macro("min") +#pragma pop_macro("max") +#endif + +#endif // NCNN_SIMPLEOCV + +#endif // NCNN_SIMPLEOCV_H diff --git a/linux/include/ncnn/simpleomp.h b/linux/include/ncnn/simpleomp.h new file mode 100644 index 0000000..13e2452 --- /dev/null +++ b/linux/include/ncnn/simpleomp.h @@ -0,0 +1,53 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef NCNN_SIMPLEOMP_H +#define NCNN_SIMPLEOMP_H + +#include "platform.h" + +#if NCNN_SIMPLEOMP + +#include + +// This minimal openmp runtime implementation only supports the llvm openmp abi +// and only supports #pragma omp parallel for num_threads(X) + +#ifdef __cplusplus +extern "C" { +#endif + +NCNN_EXPORT int omp_get_max_threads(); + +NCNN_EXPORT void omp_set_num_threads(int num_threads); + +NCNN_EXPORT int omp_get_dynamic(); + +NCNN_EXPORT void omp_set_dynamic(int dynamic); + +NCNN_EXPORT int omp_get_num_threads(); + +NCNN_EXPORT int omp_get_thread_num(); + +NCNN_EXPORT int kmp_get_blocktime(); + +NCNN_EXPORT void kmp_set_blocktime(int blocktime); + +#ifdef __cplusplus +} +#endif + +#endif // NCNN_SIMPLEOMP + +#endif // NCNN_SIMPLEOMP_H diff --git a/linux/include/ncnn/simplestl.h b/linux/include/ncnn/simplestl.h new file mode 100644 index 0000000..00ff468 --- /dev/null +++ b/linux/include/ncnn/simplestl.h @@ -0,0 +1,565 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef NCNN_SIMPLESTL_H +#define NCNN_SIMPLESTL_H + +#include +#include +#include + +#if !NCNN_SIMPLESTL + +#include + +#else + +// allocation functions +NCNN_EXPORT void* operator new(size_t size); +NCNN_EXPORT void* operator new[](size_t size); +// placement allocation functions +NCNN_EXPORT void* operator new(size_t size, void* ptr); +NCNN_EXPORT void* operator new[](size_t size, void* ptr); +// deallocation functions +NCNN_EXPORT void operator delete(void* ptr); +NCNN_EXPORT void operator delete[](void* ptr); +// deallocation functions since c++14 +#if __cplusplus >= 201402L +NCNN_EXPORT void operator delete(void* ptr, size_t sz); +NCNN_EXPORT void operator delete[](void* ptr, size_t sz); +#endif +// placement deallocation functions +NCNN_EXPORT void operator delete(void* ptr, void* voidptr2); +NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2); + +#endif + +// minimal stl data structure implementation +namespace std { + +template +const T& max(const T& a, const T& b) +{ + return (a < b) ? b : a; +} + +template +const T& min(const T& a, const T& b) +{ + return (a > b) ? b : a; +} + +template +void swap(T& a, T& b) +{ + T temp(a); + a = b; + b = temp; +} + +template +struct pair +{ + pair() + : first(), second() + { + } + pair(const T1& t1, const T2& t2) + : first(t1), second(t2) + { + } + + T1 first; + T2 second; +}; + +template +bool operator==(const pair& x, const pair& y) +{ + return (x.first == y.first && x.second == y.second); +} +template +bool operator<(const pair& x, const pair& y) +{ + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +bool operator!=(const pair& x, const pair& y) +{ + return !(x == y); +} +template +bool operator>(const pair& x, const pair& y) +{ + return y < x; +} +template +bool operator<=(const pair& x, const pair& y) +{ + return !(y < x); +} +template +bool operator>=(const pair& x, const pair& y) +{ + return !(x < y); +} + +template +pair make_pair(const T1& t1, const T2& t2) +{ + return pair(t1, t2); +} + +template +struct node +{ + node* prev_; + node* next_; + T data_; + + node() + : prev_(0), next_(0), data_() + { + } + node(const T& t) + : prev_(0), next_(0), data_(t) + { + } +}; + +template +struct iter_list +{ + iter_list() + : curr_(0) + { + } + iter_list(node* n) + : curr_(n) + { + } + iter_list(const iter_list& i) + : curr_(i.curr_) + { + } + ~iter_list() + { + } + + iter_list& operator=(const iter_list& i) + { + curr_ = i.curr_; + return *this; + } + + T& operator*() + { + return curr_->data_; + } + T* operator->() + { + return &(curr_->data_); + } + + bool operator==(const iter_list& i) + { + return curr_ == i.curr_; + } + bool operator!=(const iter_list& i) + { + return curr_ != i.curr_; + } + + iter_list& operator++() + { + curr_ = curr_->next_; + return *this; + } + iter_list& operator--() + { + curr_ = curr_->prev_; + return *this; + } + + node* curr_; +}; + +template +struct list +{ + typedef iter_list iterator; + + list() + { + head_ = new 
node(); + tail_ = head_; + count_ = 0; + } + ~list() + { + clear(); + delete head_; + } + list(const list& l) + { + head_ = new node(); + tail_ = head_; + count_ = 0; + + for (iter_list i = l.begin(); i != l.end(); ++i) + { + push_back(*i); + } + } + + list& operator=(const list& l) + { + if (this == &l) + { + return *this; + } + clear(); + + for (iter_list i = l.begin(); i != l.end(); ++i) + { + push_back(*i); + } + return *this; + } + + void clear() + { + while (count_ > 0) + { + pop_front(); + } + } + + void pop_front() + { + if (count_ > 0) + { + head_ = head_->next_; + delete head_->prev_; + head_->prev_ = 0; + --count_; + } + } + + size_t size() const + { + return count_; + } + iter_list begin() const + { + return iter_list(head_); + } + iter_list end() const + { + return iter_list(tail_); + } + bool empty() const + { + return count_ == 0; + } + + void push_back(const T& t) + { + if (count_ == 0) + { + head_ = new node(t); + head_->prev_ = 0; + head_->next_ = tail_; + tail_->prev_ = head_; + count_ = 1; + } + else + { + node* temp = new node(t); + temp->prev_ = tail_->prev_; + temp->next_ = tail_; + tail_->prev_->next_ = temp; + tail_->prev_ = temp; + ++count_; + } + } + + iter_list erase(iter_list pos) + { + if (pos != end()) + { + node* temp = pos.curr_; + if (temp == head_) + { + ++pos; + temp->next_->prev_ = 0; + head_ = temp->next_; + } + else + { + --pos; + temp->next_->prev_ = temp->prev_; + temp->prev_->next_ = temp->next_; + ++pos; + } + delete temp; + --count_; + } + return pos; + } + +protected: + node* head_; + node* tail_; + size_t count_; +}; + +template +struct greater +{ + bool operator()(const T& x, const T& y) const + { + return (x > y); + } +}; + +template +struct less +{ + bool operator()(const T& x, const T& y) const + { + return (x < y); + } +}; + +template +void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp) +{ + // [TODO] heap sort should be used here, but we simply use bubble sort now + for (RandomAccessIter i = first; i < middle; ++i) + { + // bubble sort + for (RandomAccessIter j = last - 1; j > first; --j) + { + if (comp(*j, *(j - 1))) + { + swap(*j, *(j - 1)); + } + } + } +} + +template +struct vector +{ + vector() + : data_(0), size_(0), capacity_(0) + { + } + vector(const size_t new_size, const T& value = T()) + : data_(0), size_(0), capacity_(0) + { + resize(new_size, value); + } + ~vector() + { + clear(); + } + vector(const vector& v) + : data_(0), size_(0), capacity_(0) + { + resize(v.size()); + for (size_t i = 0; i < size_; i++) + { + data_[i] = v.data_[i]; + } + } + + vector& operator=(const vector& v) + { + if (this == &v) + { + return *this; + } + resize(0); + resize(v.size()); + for (size_t i = 0; i < size_; i++) + { + data_[i] = v.data_[i]; + } + return *this; + } + + void resize(const size_t new_size, const T& value = T()) + { + try_alloc(new_size); + if (new_size > size_) + { + for (size_t i = size_; i < new_size; i++) + { + new (&data_[i]) T(value); + } + } + else if (new_size < size_) + { + for (size_t i = new_size; i < size_; i++) + { + data_[i].~T(); + } + } + size_ = new_size; + } + + void clear() + { + for (size_t i = 0; i < size_; i++) + { + data_[i].~T(); + } + delete[](char*) data_; + data_ = 0; + size_ = 0; + capacity_ = 0; + } + + T* data() const + { + return data_; + } + size_t size() const + { + return size_; + } + T& operator[](size_t i) const + { + return data_[i]; + } + T* begin() const + { + return &data_[0]; + } + T* end() const + { + return &data_[size_]; + } + bool 
empty() const + { + return size_ == 0; + } + + void push_back(const T& t) + { + try_alloc(size_ + 1); + new (&data_[size_]) T(t); + size_++; + } + + void insert(T* pos, T* b, T* e) + { + vector* v = 0; + if (b >= begin() && b < end()) + { + //the same vector + v = new vector(*this); + b = v->begin() + (b - begin()); + e = v->begin() + (e - begin()); + } + size_t diff = pos - begin(); + try_alloc(size_ + (e - b)); + pos = begin() + diff; + memmove(pos + (e - b), pos, (end() - pos) * sizeof(T)); + size_t len = e - b; + size_ += len; + for (size_t i = 0; i < len; i++) + { + *pos = *b; + pos++; + b++; + } + delete v; + } + + T* erase(T* pos) + { + pos->~T(); + memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T)); + size_--; + return pos; + } + +protected: + T* data_; + size_t size_; + size_t capacity_; + void try_alloc(size_t new_size) + { + if (new_size * 3 / 2 > capacity_ / 2) + { + capacity_ = new_size * 2; + T* new_data = (T*)new char[capacity_ * sizeof(T)]; + memset(static_cast(new_data), 0, capacity_ * sizeof(T)); + if (data_) + { + memmove(new_data, data_, sizeof(T) * size_); + delete[](char*) data_; + } + data_ = new_data; + } + } +}; + +struct NCNN_EXPORT string : public vector +{ + string() + { + } + string(const char* str) + { + size_t len = strlen(str); + resize(len); + memcpy(data_, str, len); + } + const char* c_str() const + { + return (const char*)data_; + } + bool operator==(const string& str2) const + { + return strcmp(data_, str2.data_) == 0; + } + bool operator==(const char* str2) const + { + return strcmp(data_, str2) == 0; + } + bool operator!=(const char* str2) const + { + return strcmp(data_, str2) != 0; + } + string& operator+=(const string& str1) + { + insert(end(), str1.begin(), str1.end()); + return *this; + } +}; + +inline string operator+(const string& str1, const string& str2) +{ + string str(str1); + str.insert(str.end(), str2.begin(), str2.end()); + return str; +} + +} // namespace std + +#endif // NCNN_SIMPLESTL_H diff --git a/linux/include/ncnn/vulkan_header_fix.h b/linux/include/ncnn/vulkan_header_fix.h new file mode 100644 index 0000000..0a5ea9b --- /dev/null +++ b/linux/include/ncnn/vulkan_header_fix.h @@ -0,0 +1,449 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
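+// Pattern used throughout this header: each block is gated on VK_HEADER_VERSION
+// and re-declares only the enums, structs and function-pointer typedefs that an
+// older Vulkan SDK lacks, so the rest of ncnn compiles unchanged. A minimal
+// sketch of the same idea, with a hypothetical VkExampleFeatures struct:
+//
+//   #if VK_HEADER_VERSION < 70
+//   typedef struct VkExampleFeatures { VkStructureType sType; void* pNext; } VkExampleFeatures;
+//   #endif // VK_HEADER_VERSION < 70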
+ +#ifndef NCNN_VULKAN_HEADER_FIX_H +#define NCNN_VULKAN_HEADER_FIX_H + +#include + +// This header contains new structure and function declearation to fix build with old vulkan sdk + +#if VK_HEADER_VERSION < 70 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000 +typedef enum VkSubgroupFeatureFlagBits +{ + VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001, + VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002, + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004, + VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008, + VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010, + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020, + VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040, + VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080, + VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100, + VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSubgroupFeatureFlagBits; +typedef VkFlags VkSubgroupFeatureFlags; +typedef struct VkPhysicalDeviceSubgroupProperties +{ + VkStructureType sType; + void* pNext; + uint32_t subgroupSize; + VkShaderStageFlags supportedStages; + VkSubgroupFeatureFlags supportedOperations; + VkBool32 quadOperationsInAllStages; +} VkPhysicalDeviceSubgroupProperties; +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000 +#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT (VkStructureType)1000168001 +typedef struct VkPhysicalDeviceMaintenance3Properties +{ + VkStructureType sType; + void* pNext; + uint32_t maxPerSetDescriptors; + VkDeviceSize maxMemoryAllocationSize; +} VkPhysicalDeviceMaintenance3Properties; +typedef struct VkDescriptorSetLayoutSupport +{ + VkStructureType sType; + void* pNext; + VkBool32 supported; +} VkDescriptorSetLayoutSupport; +typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR; +typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR; +typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport); +#endif // VK_HEADER_VERSION < 70 + +#if VK_HEADER_VERSION < 80 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000 +typedef struct VkPhysicalDevice8BitStorageFeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; +} VkPhysicalDevice8BitStorageFeaturesKHR; +#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR (VkStructureType)1000109000 +#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR (VkStructureType)1000109001 +#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR (VkStructureType)1000109002 +#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR (VkStructureType)1000109003 +#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004 +#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR (VkStructureType)1000109005 +#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR (VkStructureType)1000109006 +typedef struct VkAttachmentDescription2KHR +{ + VkStructureType sType; + const void* pNext; + VkAttachmentDescriptionFlags flags; + VkFormat format; + VkSampleCountFlagBits samples; + VkAttachmentLoadOp loadOp; + VkAttachmentStoreOp storeOp; + VkAttachmentLoadOp stencilLoadOp; + VkAttachmentStoreOp stencilStoreOp; + VkImageLayout initialLayout; + VkImageLayout finalLayout; +} VkAttachmentDescription2KHR; +typedef struct VkAttachmentReference2KHR +{ + VkStructureType sType; + const void* pNext; + 
uint32_t attachment; + VkImageLayout layout; + VkImageAspectFlags aspectMask; +} VkAttachmentReference2KHR; +typedef struct VkSubpassDescription2KHR +{ + VkStructureType sType; + const void* pNext; + VkSubpassDescriptionFlags flags; + VkPipelineBindPoint pipelineBindPoint; + uint32_t viewMask; + uint32_t inputAttachmentCount; + const VkAttachmentReference2KHR* pInputAttachments; + uint32_t colorAttachmentCount; + const VkAttachmentReference2KHR* pColorAttachments; + const VkAttachmentReference2KHR* pResolveAttachments; + const VkAttachmentReference2KHR* pDepthStencilAttachment; + uint32_t preserveAttachmentCount; + const uint32_t* pPreserveAttachments; +} VkSubpassDescription2KHR; +typedef struct VkSubpassDependency2KHR +{ + VkStructureType sType; + const void* pNext; + uint32_t srcSubpass; + uint32_t dstSubpass; + VkPipelineStageFlags srcStageMask; + VkPipelineStageFlags dstStageMask; + VkAccessFlags srcAccessMask; + VkAccessFlags dstAccessMask; + VkDependencyFlags dependencyFlags; + int32_t viewOffset; +} VkSubpassDependency2KHR; +typedef struct VkRenderPassCreateInfo2KHR +{ + VkStructureType sType; + const void* pNext; + VkRenderPassCreateFlags flags; + uint32_t attachmentCount; + const VkAttachmentDescription2KHR* pAttachments; + uint32_t subpassCount; + const VkSubpassDescription2KHR* pSubpasses; + uint32_t dependencyCount; + const VkSubpassDependency2KHR* pDependencies; + uint32_t correlatedViewMaskCount; + const uint32_t* pCorrelatedViewMasks; +} VkRenderPassCreateInfo2KHR; +typedef struct VkSubpassBeginInfoKHR +{ + VkStructureType sType; + const void* pNext; + VkSubpassContents contents; +} VkSubpassBeginInfoKHR; + +typedef struct VkSubpassEndInfoKHR +{ + VkStructureType sType; + const void* pNext; +} VkSubpassEndInfoKHR; +typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass); +typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo); +typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo); +typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo); +#endif // VK_HEADER_VERSION < 80 + +#if VK_HEADER_VERSION < 95 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000 +typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; +} VkPhysicalDeviceFloat16Int8FeaturesKHR; +#endif // VK_HEADER_VERSION < 95 + +#if VK_HEADER_VERSION < 97 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT (VkStructureType)1000238000 +#define VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT (VkStructureType)1000238001 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT (VkStructureType)1000244000 +#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT (VkStructureType)1000244001 +#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT (VkStructureType)1000244002 +#define VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT (VkStructureType)1000247000 +#define 
VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT (VkBufferCreateFlagBits)0x00020000 +#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT (VkBufferUsageFlagBits)0x00020000 +typedef uint64_t VkDeviceAddress; +typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT +{ + VkStructureType sType; + void* pNext; + VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS]; + VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS]; +} VkPhysicalDeviceMemoryBudgetPropertiesEXT; +typedef struct VkPhysicalDeviceMemoryPriorityFeaturesEXT +{ + VkStructureType sType; + void* pNext; + VkBool32 memoryPriority; +} VkPhysicalDeviceMemoryPriorityFeaturesEXT; +typedef struct VkMemoryPriorityAllocateInfoEXT +{ + VkStructureType sType; + const void* pNext; + float priority; +} VkMemoryPriorityAllocateInfoEXT; +typedef struct VkPhysicalDeviceBufferAddressFeaturesEXT +{ + VkStructureType sType; + void* pNext; + VkBool32 bufferDeviceAddress; + VkBool32 bufferDeviceAddressCaptureReplay; + VkBool32 bufferDeviceAddressMultiDevice; +} VkPhysicalDeviceBufferAddressFeaturesEXT; +typedef struct VkBufferDeviceAddressInfoEXT +{ + VkStructureType sType; + const void* pNext; + VkBuffer buffer; +} VkBufferDeviceAddressInfoEXT; +typedef struct VkBufferDeviceAddressCreateInfoEXT +{ + VkStructureType sType; + const void* pNext; + VkDeviceSize deviceAddress; +} VkBufferDeviceAddressCreateInfoEXT; +typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfoEXT* pInfo); +typedef enum VkValidationFeatureEnableEXT +{ + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0, + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1, + VK_VALIDATION_FEATURE_ENABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT, + VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT, + VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1), + VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkValidationFeatureEnableEXT; +typedef enum VkValidationFeatureDisableEXT +{ + VK_VALIDATION_FEATURE_DISABLE_ALL_EXT = 0, + VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT = 1, + VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT = 2, + VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT = 3, + VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT = 4, + VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT = 5, + VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT = 6, + VK_VALIDATION_FEATURE_DISABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_ALL_EXT, + VK_VALIDATION_FEATURE_DISABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT, + VK_VALIDATION_FEATURE_DISABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT - VK_VALIDATION_FEATURE_DISABLE_ALL_EXT + 1), + VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkValidationFeatureDisableEXT; +typedef struct VkValidationFeaturesEXT +{ + VkStructureType sType; + const void* pNext; + uint32_t enabledValidationFeatureCount; + const VkValidationFeatureEnableEXT* pEnabledValidationFeatures; + uint32_t disabledValidationFeatureCount; + const VkValidationFeatureDisableEXT* pDisabledValidationFeatures; +} VkValidationFeaturesEXT; +#endif // VK_HEADER_VERSION < 97 + +#if VK_HEADER_VERSION < 101 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV (VkStructureType)1000249000 +#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV 
(VkStructureType)1000249001 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002 +typedef enum VkComponentTypeNV +{ + VK_COMPONENT_TYPE_FLOAT16_NV = 0, + VK_COMPONENT_TYPE_FLOAT32_NV = 1, + VK_COMPONENT_TYPE_FLOAT64_NV = 2, + VK_COMPONENT_TYPE_SINT8_NV = 3, + VK_COMPONENT_TYPE_SINT16_NV = 4, + VK_COMPONENT_TYPE_SINT32_NV = 5, + VK_COMPONENT_TYPE_SINT64_NV = 6, + VK_COMPONENT_TYPE_UINT8_NV = 7, + VK_COMPONENT_TYPE_UINT16_NV = 8, + VK_COMPONENT_TYPE_UINT32_NV = 9, + VK_COMPONENT_TYPE_UINT64_NV = 10, + VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV, + VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV, + VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1), + VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF +} VkComponentTypeNV; +typedef enum VkScopeNV +{ + VK_SCOPE_DEVICE_NV = 1, + VK_SCOPE_WORKGROUP_NV = 2, + VK_SCOPE_SUBGROUP_NV = 3, + VK_SCOPE_QUEUE_FAMILY_NV = 5, + VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV, + VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV, + VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1), + VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF +} VkScopeNV; +typedef struct VkCooperativeMatrixPropertiesNV +{ + VkStructureType sType; + void* pNext; + uint32_t MSize; + uint32_t NSize; + uint32_t KSize; + VkComponentTypeNV AType; + VkComponentTypeNV BType; + VkComponentTypeNV CType; + VkComponentTypeNV DType; + VkScopeNV scope; +} VkCooperativeMatrixPropertiesNV; +typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV +{ + VkStructureType sType; + void* pNext; + VkBool32 cooperativeMatrix; + VkBool32 cooperativeMatrixRobustBufferAccess; +} VkPhysicalDeviceCooperativeMatrixFeaturesNV; +typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV +{ + VkStructureType sType; + void* pNext; + VkShaderStageFlags cooperativeMatrixSupportedStages; +} VkPhysicalDeviceCooperativeMatrixPropertiesNV; +typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties); +#endif // VK_HEADER_VERSION < 101 + +#if VK_HEADER_VERSION < 121 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD (VkStructureType)1000229000 +#define VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD (VkMemoryPropertyFlagBits)0x00000040 +#define VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD (VkMemoryPropertyFlagBits)0x00000040 +typedef struct VkPhysicalDeviceCoherentMemoryFeaturesAMD +{ + VkStructureType sType; + void* pNext; + VkBool32 deviceCoherentMemory; +} VkPhysicalDeviceCoherentMemoryFeaturesAMD; +#endif // VK_HEADER_VERSION < 121 + +#if VK_HEADER_VERSION < 129 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR (VkStructureType)1000257000 +#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR (VkStructureType)1000244001 +#define VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO_KHR (VkStructureType)1000257002 +#define VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR (VkStructureType)1000257003 +#define VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR (VkStructureType)1000257004 +#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR (VkBufferCreateFlagBits)0x00020000 +#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR (VkBufferUsageFlagBits)0x00020000 +#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR (VkMemoryAllocateFlagBits)0x00000002 +#define 
VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR (VkMemoryAllocateFlagBits)0x00000004 +typedef struct VkPhysicalDeviceBufferDeviceAddressFeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 bufferDeviceAddress; + VkBool32 bufferDeviceAddressCaptureReplay; + VkBool32 bufferDeviceAddressMultiDevice; +} VkPhysicalDeviceBufferDeviceAddressFeaturesKHR; +typedef struct VkBufferDeviceAddressInfoKHR +{ + VkStructureType sType; + const void* pNext; + VkBuffer buffer; +} VkBufferDeviceAddressInfoKHR; +typedef struct VkBufferOpaqueCaptureAddressCreateInfoKHR +{ + VkStructureType sType; + const void* pNext; + uint64_t opaqueCaptureAddress; +} VkBufferOpaqueCaptureAddressCreateInfoKHR; +typedef struct VkMemoryOpaqueCaptureAddressAllocateInfoKHR +{ + VkStructureType sType; + const void* pNext; + uint64_t opaqueCaptureAddress; +} VkMemoryOpaqueCaptureAddressAllocateInfoKHR; +typedef struct VkDeviceMemoryOpaqueCaptureAddressInfoKHR +{ + VkStructureType sType; + const void* pNext; + VkDeviceMemory memory; +} VkDeviceMemoryOpaqueCaptureAddressInfoKHR; +typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo); +typedef uint64_t(VKAPI_PTR* PFN_vkGetBufferOpaqueCaptureAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo); +typedef uint64_t(VKAPI_PTR* PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo); +#endif // VK_HEADER_VERSION < 129 + +#if VK_HEADER_VERSION < 208 +typedef enum VkInstanceCreateFlagBits +{ + VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001, + VK_INSTANCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkInstanceCreateFlagBits; +#endif // VK_HEADER_VERSION < 208 + +#if VK_HEADER_VERSION < 255 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR (VkStructureType)1000506000 +#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506001 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002 +typedef enum VkComponentTypeKHR +{ + VK_COMPONENT_TYPE_FLOAT16_KHR = 0, + VK_COMPONENT_TYPE_FLOAT32_KHR = 1, + VK_COMPONENT_TYPE_FLOAT64_KHR = 2, + VK_COMPONENT_TYPE_SINT8_KHR = 3, + VK_COMPONENT_TYPE_SINT16_KHR = 4, + VK_COMPONENT_TYPE_SINT32_KHR = 5, + VK_COMPONENT_TYPE_SINT64_KHR = 6, + VK_COMPONENT_TYPE_UINT8_KHR = 7, + VK_COMPONENT_TYPE_UINT16_KHR = 8, + VK_COMPONENT_TYPE_UINT32_KHR = 9, + VK_COMPONENT_TYPE_UINT64_KHR = 10, + VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkComponentTypeKHR; +typedef enum VkScopeKHR +{ + VK_SCOPE_DEVICE_KHR = 1, + VK_SCOPE_WORKGROUP_KHR = 2, + VK_SCOPE_SUBGROUP_KHR = 3, + VK_SCOPE_QUEUE_FAMILY_KHR = 5, + VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkScopeKHR; +typedef struct VkCooperativeMatrixPropertiesKHR +{ + VkStructureType sType; + void* pNext; + uint32_t MSize; + uint32_t NSize; + uint32_t KSize; + VkComponentTypeKHR AType; + VkComponentTypeKHR BType; + VkComponentTypeKHR CType; + VkComponentTypeKHR ResultType; + VkBool32 saturatingAccumulation; + VkScopeKHR scope; +} VkCooperativeMatrixPropertiesKHR; +typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 cooperativeMatrix; + VkBool32 cooperativeMatrixRobustBufferAccess; +} VkPhysicalDeviceCooperativeMatrixFeaturesKHR; +typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR +{ + VkStructureType sType; + void* pNext; + VkShaderStageFlags cooperativeMatrixSupportedStages; +} 
VkPhysicalDeviceCooperativeMatrixPropertiesKHR; +typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties); +#endif // VK_HEADER_VERSION < 255 + +#endif // NCNN_VULKAN_HEADER_FIX_H diff --git a/linux/lib/cmake/ncnn/ncnn-release.cmake b/linux/lib/cmake/ncnn/ncnn-release.cmake new file mode 100644 index 0000000..fe14fff --- /dev/null +++ b/linux/lib/cmake/ncnn/ncnn-release.cmake @@ -0,0 +1,19 @@ +#---------------------------------------------------------------- +# Generated CMake target import file for configuration "Release". +#---------------------------------------------------------------- + +# Commands may need to know the format version. +set(CMAKE_IMPORT_FILE_VERSION 1) + +# Import target "ncnn" for configuration "Release" +set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(ncnn PROPERTIES + IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so.1.0.20230920" + IMPORTED_SONAME_RELEASE "libncnn.so.1" + ) + +list(APPEND _cmake_import_check_targets ncnn ) +list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so.1.0.20230920" ) + +# Commands beyond this point should not need to know the version. +set(CMAKE_IMPORT_FILE_VERSION) diff --git a/linux/lib/cmake/ncnn/ncnn.cmake b/linux/lib/cmake/ncnn/ncnn.cmake new file mode 100644 index 0000000..5737c77 --- /dev/null +++ b/linux/lib/cmake/ncnn/ncnn.cmake @@ -0,0 +1,108 @@ +# Generated by CMake + +if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8) + message(FATAL_ERROR "CMake >= 2.8.0 required") +endif() +if(CMAKE_VERSION VERSION_LESS "2.8.3") + message(FATAL_ERROR "CMake >= 2.8.3 required") +endif() +cmake_policy(PUSH) +cmake_policy(VERSION 2.8.3...3.25) +#---------------------------------------------------------------- +# Generated CMake target import file. +#---------------------------------------------------------------- + +# Commands may need to know the format version. +set(CMAKE_IMPORT_FILE_VERSION 1) + +# Protect against multiple inclusion, which would fail when already imported targets are added once more. +set(_cmake_targets_defined "") +set(_cmake_targets_not_defined "") +set(_cmake_expected_targets "") +foreach(_cmake_expected_target IN ITEMS ncnn) + list(APPEND _cmake_expected_targets "${_cmake_expected_target}") + if(TARGET "${_cmake_expected_target}") + list(APPEND _cmake_targets_defined "${_cmake_expected_target}") + else() + list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}") + endif() +endforeach() +unset(_cmake_expected_target) +if(_cmake_targets_defined STREQUAL _cmake_expected_targets) + unset(_cmake_targets_defined) + unset(_cmake_targets_not_defined) + unset(_cmake_expected_targets) + unset(CMAKE_IMPORT_FILE_VERSION) + cmake_policy(POP) + return() +endif() +if(NOT _cmake_targets_defined STREQUAL "") + string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}") + string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}") + message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n") +endif() +unset(_cmake_targets_defined) +unset(_cmake_targets_not_defined) +unset(_cmake_expected_targets) + + +# Compute the installation prefix relative to this file. 
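+# (ncnn.cmake lives in <prefix>/lib/cmake/ncnn; the first call below takes this
+# file's directory and the next three strip ncnn/, cmake/ and lib/ to recover
+# <prefix>.)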
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) +get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) +get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) +get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) +if(_IMPORT_PREFIX STREQUAL "/") + set(_IMPORT_PREFIX "") +endif() + +# Create imported target ncnn +add_library(ncnn SHARED IMPORTED) + +set_target_properties(ncnn PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn" + INTERFACE_LINK_LIBRARIES "OpenMP::OpenMP_CXX;Threads::Threads;Vulkan::Vulkan" + INTERFACE_POSITION_INDEPENDENT_CODE "ON" +) + +if(CMAKE_VERSION VERSION_LESS 2.8.12) + message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.") +endif() + +# Load information for each installed configuration. +file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake") +foreach(_cmake_config_file IN LISTS _cmake_config_files) + include("${_cmake_config_file}") +endforeach() +unset(_cmake_config_file) +unset(_cmake_config_files) + +# Cleanup temporary variables. +set(_IMPORT_PREFIX) + +# Loop over all imported files and verify that they actually exist +foreach(_cmake_target IN LISTS _cmake_import_check_targets) + foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}") + if(NOT EXISTS "${_cmake_file}") + message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file + \"${_cmake_file}\" +but this file does not exist. Possible reasons include: +* The file was deleted, renamed, or moved to another location. +* An install or uninstall procedure did not complete successfully. +* The installation package was faulty and contained + \"${CMAKE_CURRENT_LIST_FILE}\" +but not all the files it references. +") + endif() + endforeach() + unset(_cmake_file) + unset("_cmake_import_check_files_for_${_cmake_target}") +endforeach() +unset(_cmake_target) +unset(_cmake_import_check_targets) + +# This file does not depend on other imported targets which have +# been exported from the same project but in a separate export set. + +# Commands beyond this point should not need to know the version. 
+set(CMAKE_IMPORT_FILE_VERSION) +cmake_policy(POP) diff --git a/linux/lib/cmake/ncnn/ncnnConfig.cmake b/linux/lib/cmake/ncnn/ncnnConfig.cmake new file mode 100644 index 0000000..abb2dd6 --- /dev/null +++ b/linux/lib/cmake/ncnn/ncnnConfig.cmake @@ -0,0 +1,42 @@ +set(NCNN_OPENMP ON) +set(NCNN_THREADS ON) +set(NCNN_VULKAN ON) +set(NCNN_SHARED_LIB ON) +set(NCNN_SYSTEM_GLSLANG OFF) + +if(NCNN_OPENMP) + find_package(OpenMP) +endif() + +if(NCNN_THREADS) + set(CMAKE_THREAD_PREFER_PTHREAD TRUE) + set(THREADS_PREFER_PTHREAD_FLAG TRUE) + find_package(Threads REQUIRED) +endif() + +if(NCNN_VULKAN) + find_package(Vulkan REQUIRED) + + if(NOT NCNN_SHARED_LIB) + if(NCNN_SYSTEM_GLSLANG) + find_package(glslang QUIET) + if(NOT glslang_FOUND) + set(GLSLANG_TARGET_DIR "") + include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake) + include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake) + if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + # hlsl support can be optional + include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + endif() + include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake) + include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake) + endif() + else() + set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang") + find_package(glslang QUIET) + endif() + + endif() +endif() + +include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) diff --git a/linux/lib/libncnn.so b/linux/lib/libncnn.so new file mode 120000 index 0000000..f9f93ce --- /dev/null +++ b/linux/lib/libncnn.so @@ -0,0 +1 @@ +libncnn.so.1 \ No newline at end of file diff --git a/linux/lib/libncnn.so.1 b/linux/lib/libncnn.so.1 new file mode 120000 index 0000000..428e460 --- /dev/null +++ b/linux/lib/libncnn.so.1 @@ -0,0 +1 @@ +libncnn.so.1.0.20230920 \ No newline at end of file diff --git a/linux/lib/libncnn.so.1.0.20230920 b/linux/lib/libncnn.so.1.0.20230920 new file mode 100644 index 0000000..88a5d12 Binary files /dev/null and b/linux/lib/libncnn.so.1.0.20230920 differ diff --git a/linux/lib/pkgconfig/ncnn.pc b/linux/lib/pkgconfig/ncnn.pc new file mode 100644 index 0000000..f1190cf --- /dev/null +++ b/linux/lib/pkgconfig/ncnn.pc @@ -0,0 +1,11 @@ +prefix=${pcfiledir}/../.. 
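+# ${pcfiledir} is pkg-config's built-in variable for the directory containing
+# this .pc file, so the prefix resolves correctly wherever the SDK tree lives.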
librarydir=${prefix}/lib +includedir=${prefix}/include + +Name: ncnn +Description: high-performance neural network inference framework optimized for the mobile platform +Version: 1.0.20230920 +URL: https://github.com/Tencent/ncnn +Libs: -L"${librarydir}" -lncnn +Cflags: -I"${includedir}" + diff --git a/linux/libncnn.so b/linux/libncnn.so new file mode 100644 index 0000000..88a5d12 Binary files /dev/null and b/linux/libncnn.so differ diff --git a/linux/main.cpp b/linux/main.cpp new file mode 100644 index 0000000..54a3cdb --- /dev/null +++ b/linux/main.cpp @@ -0,0 +1,10 @@ +#include "mainwindow.h" +#include <QApplication> + +int main(int argc, char *argv[]) +{ + QApplication a(argc, argv); + MainWindow w; + w.show(); + return a.exec(); +} diff --git a/linux/mainwindow.cpp b/linux/mainwindow.cpp new file mode 100644 index 0000000..f786e28 --- /dev/null +++ b/linux/mainwindow.cpp @@ -0,0 +1,829 @@ +#include "mainwindow.h" + +const int H = 512; +const int W = 512; + +double X[H], Y[W]; +double xx[H][W], yy[H][W]; +double distance[H][W]; + +class Where : public ncnn::Layer +{ +public: + Where() + { + one_blob_only = true; + } + + virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + //printf("[Where] (%d,%d,%d)\n",channels,h,w); + + top_blob.create(w, h, channels, 4u, 1, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* src = bottom_blob.channel(p); + float* dst = top_blob.channel(p); + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + if (src[0] >= 0) + dst[0] = 1.0f; + else + dst[0] = 0.2f; + src++; + dst++; + } + } + } + + return 0; + } +}; + +DEFINE_LAYER_CREATOR(Where) + +class ConvTranspose2d1 : public ncnn::Layer +{ +public: + ConvTranspose2d1() + { + one_blob_only = false; + } + + virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const + { + const ncnn::Mat& bottom_blob = bottom_blobs[0]; + const ncnn::Mat& _weight_data = bottom_blobs[1]; + ncnn::Mat& top_blob = top_blobs[0]; + + //printf("[ConvTranspose2d1] (%d,%d,%d)*(%d,%d,%d,%d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, _weight_data.c, _weight_data.d, _weight_data.h, _weight_data.w); + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + ncnn::Mat weight_data_flattened; + ncnn::flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + ncnn::Mat bias_data_flattened; + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + + const int dilation_w = 1; + const int dilation_h = 1; + const int stride_w = 2; + const int stride_h = 2; + const int pad_left = 0; + const int pad_right = 0; + const int pad_top = 0; + const int pad_bottom = 0; + const int bias_term = 0; + const int output_pad_right = 0; + const int output_pad_bottom = 0; + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_left); + pd.set(15,
pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, output_pad_right); + pd.set(19, output_pad_bottom); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(9, 0); + pd.set(10, ncnn::Mat()); + + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; + } +}; + +DEFINE_LAYER_CREATOR(ConvTranspose2d1) + +class ConvTranspose2d2 : public ncnn::Layer +{ +public: + ConvTranspose2d2() + { + one_blob_only = false; + } + + virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const + { + const ncnn::Mat& bottom_blob = bottom_blobs[0]; + const ncnn::Mat& _weight_data = bottom_blobs[1]; + ncnn::Mat& top_blob = top_blobs[0]; + + // transpose weight from cdhw to dchw + ncnn::Layer* transpose_op = ncnn::create_layer(ncnn::LayerType::Permute); + ncnn::ParamDict transpose_pd; + transpose_pd.set(0, 6); // WHDC->WHCD + transpose_op->load_param(transpose_pd); + transpose_op->create_pipeline(opt); + ncnn::Mat _weight_data_T; + transpose_op->forward(_weight_data, _weight_data_T, opt); + + //printf("[ConvTranspose2d2] (%d,%d,%d)*(%d,%d,%d,%d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, _weight_data_T.c, _weight_data_T.d, _weight_data_T.h, _weight_data_T.w); + + const int _kernel_w = _weight_data_T.w; + const int _kernel_h = _weight_data_T.h; + const int _num_output = _weight_data_T.c * _weight_data_T.elempack; + + ncnn::Mat weight_data_flattened; + ncnn::flatten(_weight_data_T, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + const int dilation_w = 1; + const int dilation_h = 1; + const int stride_w = 1; + const int stride_h = 1; + const int pad_left = 1; + const int pad_right = 1; + const int pad_top = 1; + const int pad_bottom = 1; + const int bias_term = 0; + const int output_pad_right = 0; + const int output_pad_bottom = 0; + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, output_pad_right); + pd.set(19, output_pad_bottom); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(9, 0); + pd.set(10, ncnn::Mat()); + op->load_param(pd); + + ncnn::Mat weights[2]; + ncnn::Mat bias_data_flattened; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + op->create_pipeline(opt); + op->forward(bottom_blob, top_blob, opt); + + + op->destroy_pipeline(opt); + delete op; + + transpose_op->destroy_pipeline(opt); + delete transpose_op; + + return 0; + } +}; + +DEFINE_LAYER_CREATOR(ConvTranspose2d2) + +class BConv2d1 : public ncnn::Layer +{ +public: + BConv2d1() + { + one_blob_only = false; + } + + virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const + { + const ncnn::Mat&
+class BConv2d1 : public ncnn::Layer
+{
+public:
+    BConv2d1()
+    {
+        one_blob_only = false;
+    }
+
+    virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const
+    {
+        const ncnn::Mat& bottom_blob = bottom_blobs[0];
+        const ncnn::Mat& _weight_data = bottom_blobs[1];
+
+        //printf("[BConv2d1] (%d,%d,%d)%d*(%d,%d,%d,%d)%d\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.elempack, _weight_data.c, _weight_data.d, _weight_data.h, _weight_data.w, _weight_data.elempack);
+
+        // set up the conv2d parameters
+        const int _kernel_w = _weight_data.w;
+        const int _kernel_h = _weight_data.h;
+        const int _num_output = _weight_data.c * _weight_data.elempack;
+        const int _num_input = bottom_blob.c;
+
+        ncnn::Mat weight_data_flattened;
+        ncnn::flatten(_weight_data, weight_data_flattened, opt);
+        if (weight_data_flattened.empty())
+            return -100;
+
+        weight_data_flattened.w *= weight_data_flattened.elempack;
+        weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+        weight_data_flattened.elempack = 1;
+
+        ncnn::Mat bias_data_flattened;
+        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+        ncnn::ParamDict pd;
+        pd.set(0, _num_output);
+        pd.set(1, _kernel_w);
+        pd.set(11, _kernel_h);
+        pd.set(2, 2);
+        pd.set(21, 2);
+        pd.set(3, 1);
+        pd.set(31, 1);
+        pd.set(4, 0);
+        pd.set(15, 0);
+        pd.set(14, 0);
+        pd.set(16, 0);
+        pd.set(18, 0);
+        pd.set(5, 0);
+        pd.set(6, weight_data_flattened.w);
+        pd.set(8, 0);
+        op->load_param(pd);
+
+        ncnn::Mat weights[2];
+        weights[0] = weight_data_flattened;
+        weights[1] = bias_data_flattened;
+        op->load_model(ncnn::ModelBinFromMatArray(weights));
+        op->create_pipeline(opt);
+
+        // run one convolution per batch sample; note that bottom_blob and
+        // top_blob_set may differ in elempack here
+        std::vector<ncnn::Mat> top_blob_set(_num_input);
+        for (int i = 0; i < _num_input; i++) {
+            op->forward(bottom_blob.channel(i), top_blob_set[i], opt);
+        }
+
+        // concatenate the per-sample results
+        std::vector<ncnn::Mat> cat_out(1);
+        ncnn::Layer* cat = ncnn::create_layer(ncnn::LayerType::Concat);
+        ncnn::ParamDict cat_pd;
+        cat_pd.set(0, 0);
+        cat->load_param(cat_pd);
+        cat->create_pipeline(opt);
+        cat->forward(top_blob_set, cat_out, opt);
+
+        // reshape
+        ncnn::Layer* reshape = ncnn::create_layer(ncnn::LayerType::Reshape);
+        ncnn::ParamDict reshape_pd;
+        reshape_pd.set(0, cat_out[0].w);
+        reshape_pd.set(1, cat_out[0].h);
+        reshape_pd.set(11, _num_output);
+        reshape_pd.set(2, _num_input);
+        reshape->load_param(reshape_pd);
+        reshape->create_pipeline(opt);
+        reshape->forward(cat_out[0], top_blobs[0], opt);
+
+        // clean up
+        reshape->destroy_pipeline(opt);
+        delete reshape;
+
+        cat->destroy_pipeline(opt);
+        delete cat;
+
+        op->destroy_pipeline(opt);
+        delete op;
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(BConv2d1)
+
+class BConv2d2 : public ncnn::Layer
+{
+public:
+    BConv2d2()
+    {
+        one_blob_only = false;
+    }
+
+    virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const
+    {
+        const ncnn::Mat& bottom_blob = bottom_blobs[0];
+        const ncnn::Mat& _weight_data = bottom_blobs[1];
+        ncnn::Mat& top_blob = top_blobs[0];
+
+        //printf("[BConv2d2] (%d,%d,%d)%d*(%d,%d,%d,%d)%d\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.elempack, _weight_data.c, _weight_data.d, _weight_data.h, _weight_data.w, _weight_data.elempack);
+
+        // set up the conv2d parameters
+        const int _kernel_w = _weight_data.w;
+        const int _kernel_h = _weight_data.h;
+        const int _num_output = _weight_data.c * _weight_data.elempack;
+        const int _num_input = bottom_blob.c;
+
+        ncnn::Mat weight_data_flattened;
+        ncnn::flatten(_weight_data, weight_data_flattened, opt);
+        if (weight_data_flattened.empty())
+            return -100;
+
+        weight_data_flattened.w *= weight_data_flattened.elempack;
+        weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+        weight_data_flattened.elempack = 1;
+
+        ncnn::Mat bias_data_flattened;
+        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+        ncnn::ParamDict pd;
+        pd.set(0, _num_output);
+        pd.set(1, _kernel_w);
+        pd.set(11, _kernel_h);
+        pd.set(2, 1);
+        pd.set(21, 1);
+        pd.set(3, 1);
+        pd.set(31, 1);
+        pd.set(4, 1);
+        pd.set(15, 1);
+        pd.set(14, 1);
+        pd.set(16, 1);
+        pd.set(18, 0);
+        pd.set(5, 0);
+        pd.set(6, weight_data_flattened.w);
+        op->load_param(pd);
+
+        ncnn::Mat weights[2];
+        weights[0] = weight_data_flattened;
+        weights[1] = bias_data_flattened;
+        op->load_model(ncnn::ModelBinFromMatArray(weights));
+        op->create_pipeline(opt);
+
+        // run one convolution per batch sample; note that bottom_blob and
+        // top_blob_set may differ in elempack here
+        std::vector<ncnn::Mat> top_blob_set(_num_input);
+        for (int i = 0; i < _num_input; i++) {
+            op->forward(bottom_blob.channel(i), top_blob_set[i], opt);
+        }
+
+        // concatenate the per-sample results
+        std::vector<ncnn::Mat> cat_out(1);
+        ncnn::Layer* cat = ncnn::create_layer(ncnn::LayerType::Concat);
+        ncnn::ParamDict cat_pd;
+        cat_pd.set(0, 0);
+        cat->load_param(cat_pd);
+        cat->create_pipeline(opt);
+        cat->forward(top_blob_set, cat_out, opt);
+
+        // reshape
+        ncnn::Layer* reshape = ncnn::create_layer(ncnn::LayerType::Reshape);
+        ncnn::ParamDict reshape_pd;
+        reshape_pd.set(0, cat_out[0].w);
+        reshape_pd.set(1, cat_out[0].h);
+        reshape_pd.set(11, _num_output);
+        reshape_pd.set(2, _num_input);
+        reshape->load_param(reshape_pd);
+        reshape->create_pipeline(opt);
+        reshape->forward(cat_out[0], top_blob, opt);
+
+        // clean up
+        reshape->destroy_pipeline(opt);
+        delete reshape;
+
+        cat->destroy_pipeline(opt);
+        delete cat;
+
+        op->destroy_pipeline(opt);
+        delete op;
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(BConv2d2)
+
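+// ---------------------------------------------------------------------------
+// Helpers: rand() draws the 512-dim latent z from N(0,1) with an OpenCV RNG
+// seeded by the user, so the same seed reproduces the same image.
+// linspace()/meshgrid() precompute per-pixel coordinate grids (xx, yy) that
+// the drag loop uses to select the circular neighbourhood around the handle
+// point.
+// ---------------------------------------------------------------------------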
+// draw z ~ N(0,1); the ncnn::Mat wraps the first 512 floats of the buffer
+ncnn::Mat rand(int seed)
+{
+    cv::Mat cv_x(cv::Size(512, 1), CV_32FC4);
+    cv::RNG rng(seed);
+    rng.fill(cv_x, cv::RNG::NORMAL, 0, 1);
+    ncnn::Mat x_mat(512, 1, (void*)cv_x.data);
+    return x_mat.clone();
+}
+
+void linspace(double* arr, double start, double end, int size)
+{
+    double step = (end - start) / (size - 1);
+    for (int i = 0; i < size; i++)
+    {
+        arr[i] = start + i * step;
+    }
+}
+
+void meshgrid(double* X, double* Y, double(*xx)[W], double(*yy)[W])
+{
+    for (int i = 0; i < H; i++)
+    {
+        for (int j = 0; j < W; j++)
+        {
+            xx[i][j] = X[i];
+            yy[i][j] = Y[j];
+        }
+    }
+}
+
+template<typename T>
+int sign(T val) {
+    return (T(0) < val) - (val < T(0));
+}
+
+MainWindow::MainWindow(QWidget *parent)
+    : QMainWindow(parent)
+{
+    ui.setupUi(this);
+
+    // load the mapping network
+    mapping.load_param("assets/mapping.param");
+    mapping.load_model("assets/mapping.bin");
+
+    // load the generator and register its custom layers
+    generator.register_custom_layer("Where", Where_layer_creator);
+    generator.register_custom_layer("ConvTranspose2d1", ConvTranspose2d1_layer_creator);
+    generator.register_custom_layer("ConvTranspose2d2", ConvTranspose2d2_layer_creator);
+    generator.register_custom_layer("BConv2d1", BConv2d1_layer_creator);
+    generator.register_custom_layer("BConv2d2", BConv2d2_layer_creator);
+    generator.load_param("assets/generator.param");
+    generator.load_model("assets/generator.bin");
+
+    // create the gridsample operator
+    gridsample_opt = generator.opt;
+    gridsample = ncnn::create_layer(ncnn::LayerType::GridSample);
+    ncnn::ParamDict gridsample_pd;
+    gridsample->load_param(gridsample_pd);
+    gridsample->create_pipeline(gridsample_opt);
+
+    // create the interp operator (bilinear, fixed 128x128 output)
+    interp_opt = generator.opt;
+    interp = ncnn::create_layer(ncnn::LayerType::Interp);
+    ncnn::ParamDict interp_pd;
+    interp_pd.set(0, 2);
+    interp_pd.set(3, 128);
+    interp_pd.set(4, 128);
+    interp->load_param(interp_pd);
+    interp->create_pipeline(interp_opt);
+
+    // precompute the coordinate grids
+    linspace(X, 0, H, H);
+    linspace(Y, 0, W, W);
+    meshgrid(X, Y, xx, yy);
+}
+
+MainWindow::~MainWindow()
+{
+    gridsample->destroy_pipeline(gridsample_opt);
+    delete gridsample;
+
+    interp->destroy_pipeline(interp_opt);
+    delete interp;
+}
+
+void MainWindow::showImage(cv::Mat in)
+{
+    cv::Mat show;
+    cv::resize(in, show, cv::Size(1024, 1024));
+    QImage qImage(show.data, show.cols, show.rows, static_cast<int>(show.step), QImage::Format_RGB888);
+    ui.show->setPixmap(QPixmap::fromImage(qImage));
+    ui.show->show();
+}
+
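+// "Get Image": run the mapping network on z to obtain the 16x512 latent w,
+// keep a trainable copy of its first 6 rows (the coarse layers that the drag
+// step will optimise; rows 6..15 stay fixed at w0), then render the initial
+// image.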
+void MainWindow::on_getBtn_clicked()
+{
+    int seed = QVariant(ui.seedEdit->text()).toInt();
+
+    // run mapping first to get the initial w
+    ncnn::Mat z = rand(seed);
+    ncnn::Mat w0(512, 16);
+    w0.fill(1.0f);
+    {
+        ncnn::Mat output;
+        ncnn::Extractor ex = mapping.create_extractor();
+        ex.input("/mapping/Cast_output_0", z);
+        ex.extract("/mapping/Sub_2_output_0", output);
+        // broadcast the single mapping output row to all 16 layers
+        float* src = output.row(0);
+        for (int i = 0; i < 16; i++) {
+            float* dst = w0.row(i);
+            for (int j = 0; j < 512; j++) {
+                dst[j] = src[j];
+            }
+        }
+    }
+    ncnn::Mat w = w0.row_range(0, 6).clone();
+
+    // keep a copy of w and w0 for the drag step
+    w_tmp = w.clone();
+    w0_tmp = w0.clone();
+
+    // generate the initial image
+    {
+        ncnn::Mat ws(512, 16);
+        {
+            for (int i = 0; i < 6; i++) {
+                float* src = w.row(i);
+                float* dst = ws.row(i);
+                for (int j = 0; j < 512; j++) {
+                    dst[j] = src[j];
+                }
+            }
+            for (int i = 6; i < 16; i++) {
+                float* src = w0.row(i);
+                float* dst = ws.row(i);
+                for (int j = 0; j < 512; j++) {
+                    dst[j] = src[j];
+                }
+            }
+        }
+
+        ncnn::Mat img;
+        {
+            ncnn::Extractor ex = generator.create_extractor();
+            ex.set_light_mode(true);
+            ex.input("in0", ws);
+            ex.extract("out0", img);
+        }
+        const float _mean_[3] = { -128.0f / 127.5f, -128.0f / 127.5f, -128.0f / 127.5f };
+        const float _norm_[3] = { 127.5f, 127.5f, 127.5f };
+        img.substract_mean_normalize(_mean_, _norm_);
+        cv::Mat image(512, 512, CV_8UC3);
+        img.to_pixels(image.data, ncnn::Mat::PIXEL_RGB);
+        showwing = image.clone();
+
+        showImage(showwing);
+
+        qDebug() << "[Init] seed:" << seed;
+    }
+}
+
+void MainWindow::mousePressEvent(QMouseEvent* event)
+{
+    if (event->button() == Qt::LeftButton) {
+
+        QPoint globalPos = event->globalPos();
+
+        QPoint labelPos = ui.show->mapToGlobal(QPoint(0, 0));
+        int labelWidth = ui.show->width();
+        int labelHeight = ui.show->height();
+
+        int relativeX = globalPos.x() - labelPos.x();
+        int relativeY = globalPos.y() - labelPos.y();
+
+        // the label shows the 512x512 image upscaled to 1024x1024,
+        // so map the click back to image coordinates
+        relativeX = relativeX / 2;
+        relativeY = relativeY / 2;
+
+        if (points[0] == -1) {
+            cv::Mat show_point = showwing.clone();
+            cv::circle(show_point, cv::Point(relativeX, relativeY), 3, cv::Scalar(255, 0, 0), -1);
+            showImage(show_point);
+
+            points[0] = relativeY;
+            points[1] = relativeX;
+
+            qDebug() << "[Choose] start point: (" << points[0] << "," << points[1] << ")";
+        }
+        else if (targets[0] == -1) {
+            cv::Mat show_point = showwing.clone();
+            cv::circle(show_point, cv::Point(int(points[1]), int(points[0])), 3, cv::Scalar(255, 0, 0), -1);
+            cv::circle(show_point, cv::Point(relativeX, relativeY), 3, cv::Scalar(0, 255, 0), -1);
+            showImage(show_point);
+
+            targets[0] = relativeY;
+            targets[1] = relativeX;
+
+            qDebug() << "[Choose] target point: (" << targets[0] << "," << targets[1] << ")";
+        }
+
+    }
+}
+
+void MainWindow::on_cleanBtn_clicked()
+{
+    points[0] = -1;
+    points[1] = -1;
+    targets[0] = -1;
+    targets[1] = -1;
+    showImage(showwing);
+    qDebug() << "[Clean] point";
+}
+
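+// "Drag" implements the DragGAN iteration: up to 200 rounds of point tracking
+// followed by one motion-supervision step. Roughly, per iteration (F is the
+// 256-channel generator feature map resized to 512x512):
+//
+//   p    <- argmin over the r2-patch of ||F(q) - F_ref||_2       (tracking)
+//   d    <- (target - p) / ||target - p||                        (unit step)
+//   g(q) <- sign(F(q) - F(q - d)) / 256  for q within r1 of p    (L1 gradient)
+//   w    <- w - lr * (dL/dw) * w         on the 6 trainable rows
+//
+// The loop finishes early once the handle point is within one pixel of the
+// target.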
+void MainWindow::on_dragBtn_clicked()
+{
+    float lr = 0.1f;
+    ncnn::Mat feat_refs;
+    int r1 = 3, r2 = 12;
+
+    ncnn::Mat w = w_tmp.clone();
+    ncnn::Mat w0 = w0_tmp.clone();
+
+    for (int it = 0; it < 200; it++)
+    {
+        ncnn::Mat ws(512, 16);
+        {
+            for (int i = 0; i < 6; i++) {
+                float* src = w.row(i);
+                float* dst = ws.row(i);
+                for (int j = 0; j < 512; j++) {
+                    dst[j] = src[j];
+                }
+            }
+            for (int i = 6; i < 16; i++) {
+                float* src = w0.row(i);
+                float* dst = ws.row(i);
+                for (int j = 0; j < 512; j++) {
+                    dst[j] = src[j];
+                }
+            }
+        }
+
+        {
+            ncnn::Mat feat5;
+            ncnn::Extractor ex = generator.create_extractor();
+            ex.set_light_mode(false);
+            ex.input("in0", ws);
+            ex.extract("out1", feat5);
+
+            ncnn::Mat feat_resize;
+            ncnn::resize_bilinear(feat5, feat_resize, W, H);
+
+            // on the first iteration, record the reference features at the handle point
+            if (feat_refs.empty()) {
+                feat_refs.create(256);
+                for (int i = 0; i < 256; i++) {
+                    feat_refs[i] = feat_resize.channel(i).row(int(std::round(points[0])))[int(std::round(points[1]))];
+                }
+            }
+
+            // Point tracking with feature matching
+            int r = std::round(r2 / 512.0 * H);
+            int up = std::max(points[0] - r, (double)0.0);
+            int down = std::min(points[0] + r + 1, (double)H);
+            int left = std::max(points[1] - r, (double)0.0);
+            int right = std::min(points[1] + r + 1, (double)W);
+            int height_patch = down - up;
+            int width_patch = right - left;
+            float min_value = 1e8f;
+            int min_y = -1, min_x = -1;
+            for (int h = 0; h < height_patch; h++) {
+                for (int w = 0; w < width_patch; w++) {
+                    float tmp = 0.0f;
+                    for (int c = 0; c < 256; c++) {
+                        tmp += std::pow(feat_resize.channel(c).row(up + h)[left + w] - feat_refs[c], 2);
+                    }
+                    tmp = std::sqrt(tmp);
+                    if ((min_y == -1 && min_x == -1) || tmp < min_value) {
+                        min_value = tmp;
+                        min_y = up + h;
+                        min_x = left + w;
+                    }
+                }
+            }
+            points[0] = min_y;
+            points[1] = min_x;
+
+            qDebug() << "[Drag " << it << "] current:(" << int(points[0]) << "," << int(points[1]) << "), target:(" << targets[0] << "," << targets[1] << ")";
+
+            //// save intermediate
+            //{
+            //    ncnn::Mat img;
+            //    ex.extract("out0", img);
+            //    const float _mean_[3] = { -128.0f / 127.5f, -128.0f / 127.5f, -128.0f / 127.5f };
+            //    const float _norm_[3] = { 127.5f, 127.5f, 127.5f };
+            //    img.substract_mean_normalize(_mean_, _norm_);
+            //    cv::Mat image(512, 512, CV_8UC3);
+            //    img.to_pixels(image.data, ncnn::Mat::PIXEL_RGB2BGR);
+            //    cv::circle(image, cv::Point(int(points[1]), int(points[0])), 3, cv::Scalar(0, 0, 255), -1);
+            //    cv::circle(image, cv::Point(int(targets[1]), int(targets[0])), 3, cv::Scalar(0, 255, 0), -1);
+            //    cv::imwrite("images/" + std::to_string(it) + ".png", image);
+            //}
+
+            // Motion supervision
+            double direction[2] = { targets[1] - points[1], targets[0] - points[0] };
+            if (std::sqrt(std::pow(direction[0], 2) + std::pow(direction[1], 2)) > 1) {
+
+                // collect the pixels within radius r1 of the handle point
+                std::vector<int> relis, reljs;
+                for (int h = 0; h < H; h++) {
+                    for (int w = 0; w < W; w++) {
+                        if (std::sqrt(std::pow(xx[h][w] - points[0], 2) + std::pow(yy[h][w] - points[1], 2)) < std::round(r1 / 512.0 * H)) {
+                            relis.push_back(h);
+                            reljs.push_back(w);
+                        }
+                    }
+                }
+
+                double direction_norm = std::sqrt(std::pow(direction[0], 2) + std::pow(direction[1], 2));
+                direction[0] /= direction_norm;
+                direction[1] /= direction_norm;
+
+                // sampling grid shifted one unit step against the drag
+                // direction, in normalized [-1, 1] coordinates
+                ncnn::Mat grid;
+                grid.create(2, int(relis.size()), 1, (size_t)4u);
+                for (int w = 0; w < int(relis.size()); w++) {
+                    grid.channel(0).row(w)[0] = (reljs[w] - direction[0]) / (W - 1) * 2 - 1;
+                    grid.channel(0).row(w)[1] = (relis[w] - direction[1]) / (H - 1) * 2 - 1;
+                }
+
+                std::vector<ncnn::Mat> inputs(2);
+                inputs[0] = feat_resize;
+                inputs[1] = grid;
+                std::vector<ncnn::Mat> outputs(1);
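+                // GridSample fetches F at the shifted positions; the sign-based
+                // L1 gradient is scattered into a zeroed 512x512 map, resized by
+                // the interp op back to the raw 128x128 feature resolution, and
+                // fed to "in1", which appears to be a hand-exported backward
+                // path producing the per-layer w gradients out2..out7.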
+                gridsample->forward(inputs, outputs, gridsample_opt);
+                ncnn::Mat& target = outputs[0];
+
+                ncnn::Mat feat5_grad(512, 512, 256, (size_t)4u);
+                feat5_grad.fill(0.0f); // gradient is zero outside the r1 neighbourhood
+                for (int i = 0; i < int(relis.size()); i++) {
+                    for (int c = 0; c < 256; c++) {
+                        feat5_grad.channel(c).row(relis[i])[reljs[i]] = sign(feat_resize.channel(c).row(relis[i])[reljs[i]] - target.channel(c).row(0)[i]) / 256.0;
+                    }
+                }
+                ncnn::Mat feat5_grad_fit;
+                interp->forward(feat5_grad, feat5_grad_fit, interp_opt);
+
+                ex.input("in1", feat5_grad_fit);
+                std::vector<ncnn::Mat> vg(6);
+                ex.extract("out2", vg[0]);
+                ex.extract("out3", vg[1]);
+                ex.extract("out4", vg[2]);
+                ex.extract("out5", vg[3]);
+                ex.extract("out6", vg[4]);
+                ex.extract("out7", vg[5]);
+
+                // update w: only the first 6 rows are optimised
+                for (int i = 0; i < 6; i++) {
+                    for (int j = 0; j < 512; j++) {
+                        w.row(i)[j] = w.row(i)[j] - lr * vg[i].row(0)[j] * w.row(i)[j];
+                    }
+                }
+            }
+            else {
+                qDebug() << "[Finish]";
+
+                ncnn::Mat img;
+                ex.extract("out0", img);
+                const float _mean_[3] = { -128.0f / 127.5f, -128.0f / 127.5f, -128.0f / 127.5f };
+                const float _norm_[3] = { 127.5f, 127.5f, 127.5f };
+                img.substract_mean_normalize(_mean_, _norm_);
+                cv::Mat image(512, 512, CV_8UC3);
+                img.to_pixels(image.data, ncnn::Mat::PIXEL_RGB);
+                showwing = image.clone();
+
+                showImage(showwing);
+
+                break;
+            }
+
+            if (it == 200 - 1) {
+                ncnn::Mat img;
+                ex.extract("out0", img);
+                const float _mean_[3] = { -128.0f / 127.5f, -128.0f / 127.5f, -128.0f / 127.5f };
+                const float _norm_[3] = { 127.5f, 127.5f, 127.5f };
+                img.substract_mean_normalize(_mean_, _norm_);
+                cv::Mat image(512, 512, CV_8UC3);
+                img.to_pixels(image.data, ncnn::Mat::PIXEL_RGB);
+                showwing = image.clone();
+
+                showImage(showwing);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/linux/mainwindow.h b/linux/mainwindow.h
new file mode 100644
index 0000000..2eb90b5
--- /dev/null
+++ b/linux/mainwindow.h
@@ -0,0 +1,64 @@
+#pragma once
+#include <QtWidgets/QMainWindow>
+#include "ui_mainwindow.h"
+#include <QMouseEvent>
+#include <QImage>
+#include <QPixmap>
+#include <QDebug>
+#include <QVariant>
+
+// AVOID moc parse error
+#ifndef Q_MOC_RUN
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+#include <opencv2/core/core.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+
+#include "ncnn/net.h"
+#include "ncnn/layer.h"
+#include "ncnn/layer_type.h"
+#include "ncnn/benchmark.h"
+#endif
+
+class MainWindow : public QMainWindow
+{
+    Q_OBJECT
+
+public:
+    MainWindow(QWidget *parent = nullptr);
+    ~MainWindow();
+    void showImage(cv::Mat in);
+
+private slots:
+    void on_getBtn_clicked();
+    void on_cleanBtn_clicked();
+    void on_dragBtn_clicked();
+
+protected:
+    void mousePressEvent(QMouseEvent* event);
+
+private:
+    Ui::MainWindowClass ui;
+
+    cv::Mat showwing;
+    double points[2] = { -1,-1 };
+    double targets[2] = { -1,-1 };
+
+    ncnn::Mat w_tmp;
+    ncnn::Mat w0_tmp;
+
+    ncnn::Net mapping;
+    ncnn::Net generator;
+
+    ncnn::Layer* gridsample;
+    ncnn::Option gridsample_opt;
+
+    ncnn::Layer* interp;
+    ncnn::Option interp_opt;
+};
diff --git a/linux/mainwindow.qrc b/linux/mainwindow.qrc
new file mode 100644
index 0000000..68043e7
--- /dev/null
+++ b/linux/mainwindow.qrc
@@ -0,0 +1,4 @@
+<RCC>
+    <qresource prefix="MainWindow">
+    </qresource>
+</RCC>
diff --git a/linux/mainwindow.ui b/linux/mainwindow.ui
new file mode 100644
index 0000000..66fee83
--- /dev/null
+++ b/linux/mainwindow.ui
@@ -0,0 +1,126 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>MainWindowClass</class>
+ <widget class="QMainWindow" name="MainWindowClass">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>1024</width>
+    <height>1077</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>MainWindow</string>
+  </property>
+  <widget class="QWidget" name="centralWidget">
+   <layout class="QVBoxLayout" name="verticalLayout">
+    <property name="spacing">
+     <number>0</number>
+    </property>
+    <property name="leftMargin">
+     <number>0</number>
+    </property>
+    <property name="topMargin">
+     <number>0</number>
+    </property>
+    <property name="rightMargin">
+     <number>0</number>
+    </property>
+    <property name="bottomMargin">
+     <number>0</number>
+    </property>
+    <item>
+     <layout class="QHBoxLayout" name="horizontalLayout">
+      <item>
+       <widget class="QLineEdit" name="seedEdit"/>
+      </item>
+      <item>
+       <widget class="QPushButton" name="getBtn">
+        <property name="text">
+         <string>Get Image</string>
+        </property>
+       </widget>
+      </item>
+      <item>
+       <spacer name="horizontalSpacer">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>40</width>
+          <height>20</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item>
+       <widget class="QPushButton" name="cleanBtn">
+        <property name="text">
+         <string>Clean Point</string>
+        </property>
+       </widget>
+      </item>
+      <item>
+       <spacer name="horizontalSpacer_2">
+        <property name="orientation">
+         <enum>Qt::Horizontal</enum>
+        </property>
+        <property name="sizeHint" stdset="0">
+         <size>
+          <width>40</width>
+          <height>20</height>
+         </size>
+        </property>
+       </spacer>
+      </item>
+      <item>
+       <widget class="QPushButton" name="dragBtn">
+        <property name="text">
+         <string>Drag</string>
+        </property>
+       </widget>
+      </item>
+     </layout>
+    </item>
+    <item>
+     <widget class="QLabel" name="show">
+      <property name="text">
+       <string>TextLabel</string>
+      </property>
+      <property name="alignment">
+       <set>Qt::AlignCenter</set>
+      </property>
+     </widget>
+    </item>
+   </layout>
+  </widget>
+  <widget class="QMenuBar" name="menuBar">
+   <property name="geometry">
+    <rect>
+     <x>0</x>
+     <y>0</y>
+     <width>1024</width>
+     <height>26</height>
+    </rect>
+   </property>
+  </widget>
+  <widget class="QToolBar" name="mainToolBar">
+   <attribute name="toolBarArea">
+    <enum>TopToolBarArea</enum>
+   </attribute>
+   <attribute name="toolBarBreak">
+    <bool>false</bool>
+   </attribute>
+  </widget>
+  <widget class="QStatusBar" name="statusBar"/>
+ </widget>
+ <resources/>
+ <connections/>
+</ui>