From be9da14ed8090487a724df153edec38b9c2f6b38 Mon Sep 17 00:00:00 2001
From: Peter Marcisovsky <peter.marcisovsky@espressif.com>
Date: Fri, 26 Jul 2024 10:50:17 +0200
Subject: [PATCH] feature(lvgl_port): Initial support for SIMD rendering in
 LVGL

    - Assembly source files for LVGL blend API integrated into lvgl_port
    - Initial assembly implementation of:
        - ARGB8888 simple fill for esp32s3 and esp32
        - RGB565 simple fill for esp32
    - Functionality and benchmark test app
---
 .gitignore                                    |   1 +
 components/esp_lvgl_port/CMakeLists.txt       |  21 +
 .../include/esp_lvgl_port_lv_blend.h          |  90 ++
 .../simd/lv_color_blend_to_argb8888_esp32.S   |  81 ++
 .../simd/lv_color_blend_to_argb8888_esp32s3.S | 328 ++++++
 .../simd/lv_color_blend_to_rgb565_esp32.S     | 149 +++
 .../simd/lv_color_blend_to_rgb565_esp32s3.S   | 149 +++
 .../test_apps/{ => lvgl_port}/CMakeLists.txt  |   0
 .../{ => lvgl_port}/main/CMakeLists.txt       |   0
 .../{ => lvgl_port}/main/idf_component.yml    |   4 +-
 .../test_apps/{ => lvgl_port}/main/test.c     |   0
 .../lvgl_port/sdkconfig.ci.asm_render         |   6 +
 .../{ => lvgl_port}/sdkconfig.defaults        |   0
 .../test_apps/simd/CMakeLists.txt             |   7 +
 .../esp_lvgl_port/test_apps/simd/README.md    | 109 ++
 .../test_apps/simd/main/CMakeLists.txt        |  21 +
 .../test_apps/simd/main/Kconfig.projbuild     |   5 +
 .../simd/main/lv_blend/include/lv_assert.h    |  60 ++
 .../simd/main/lv_blend/include/lv_color.h     | 272 +++++
 .../simd/main/lv_blend/include/lv_color_op.h  |  93 ++
 .../main/lv_blend/include/lv_draw_sw_blend.h  |  74 ++
 .../include/lv_draw_sw_blend_to_argb8888.h    |  51 +
 .../include/lv_draw_sw_blend_to_rgb565.h      |  51 +
 .../simd/main/lv_blend/include/lv_log.h       |  45 +
 .../simd/main/lv_blend/include/lv_math.h      |  56 +
 .../simd/main/lv_blend/include/lv_style.h     |  48 +
 .../simd/main/lv_blend/include/lv_types.h     |  51 +
 .../simd/main/lv_blend/src/lv_color.c         |  66 ++
 .../src/lv_draw_sw_blend_to_argb8888.c        | 911 +++++++++++++++++
 .../lv_blend/src/lv_draw_sw_blend_to_rgb565.c | 960 ++++++++++++++++++
 .../test_apps/simd/main/lv_fill_common.h      |  73 ++
 .../test_apps/simd/main/test_app_main.c       |  50 +
 .../simd/main/test_lv_fill_benchmark.c        | 176 ++++
 .../simd/main/test_lv_fill_functionality.c    | 311 ++++++
 .../test_apps/simd/sdkconfig.defaults         |   3 +
 35 files changed, 4320 insertions(+), 2 deletions(-)
 create mode 100644 components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
 rename components/esp_lvgl_port/test_apps/{ => lvgl_port}/CMakeLists.txt (100%)
 rename components/esp_lvgl_port/test_apps/{ => lvgl_port}/main/CMakeLists.txt (100%)
 rename components/esp_lvgl_port/test_apps/{ => lvgl_port}/main/idf_component.yml (59%)
 rename components/esp_lvgl_port/test_apps/{ => lvgl_port}/main/test.c (100%)
 create mode 100644 components/esp_lvgl_port/test_apps/lvgl_port/sdkconfig.ci.asm_render
 rename components/esp_lvgl_port/test_apps/{ => lvgl_port}/sdkconfig.defaults (100%)
 create mode 100644 components/esp_lvgl_port/test_apps/simd/CMakeLists.txt
 create mode 100644 components/esp_lvgl_port/test_apps/simd/README.md
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/Kconfig.projbuild
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_assert.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color_op.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_argb8888.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb565.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_log.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_math.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_style.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_color.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/test_app_main.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/sdkconfig.defaults

diff --git a/.gitignore b/.gitignore
index e8989d81..89afabd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ dependencies.lock
 doxygen_output/**
 dist
 __pycache__
+gdbinit
diff --git a/components/esp_lvgl_port/CMakeLists.txt b/components/esp_lvgl_port/CMakeLists.txt
index c9948677..8dc53693 100644
--- a/components/esp_lvgl_port/CMakeLists.txt
+++ b/components/esp_lvgl_port/CMakeLists.txt
@@ -76,6 +76,27 @@ if("usb_host_hid" IN_LIST build_components)
     list(APPEND ADD_LIBS idf::usb_host_hid)
 endif()
 
+# Include SIMD assembly source code for rendering, only for (9.1.0 <= LVGL version < 9.2.0) and only for esp32 and esp32s3
+if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
+    if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3)
+        message(VERBOSE "Compiling SIMD")
+        if(CONFIG_IDF_TARGET_ESP32S3)
+            file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32s3.S)    # Select only esp32s3 related files
+        else()
+            file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32.S)      # Select only esp32 related files
+        endif()
+        list(APPEND ADD_SRCS ${ASM_SRCS})
+
+        # Add the lvgl_port "include" directory to the lvgl component, so LVGL sources can see the lvgl_port blend header
+        idf_component_get_property(lvgl_lib ${lvgl_name} COMPONENT_LIB)
+        target_include_directories(${lvgl_lib} PRIVATE "include")
+
+        # Force link .S files: "-u <symbol>" marks the symbol as undefined, so the linker pulls in the assembly objects
+        set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_argb8888_esp")
+        set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp")
+    endif()
+endif()
+
 # Here we create the real lvgl_port_lib
 add_library(lvgl_port_lib STATIC
     ${PORT_PATH}/esp_lvgl_port.c
diff --git a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
new file mode 100644
index 00000000..c00de1c0
--- /dev/null
+++ b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
@@ -0,0 +1,90 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+
+#if !CONFIG_LV_DRAW_SW_ASM_CUSTOM
+#warning "esp_lvgl_port_lv_blend.h included, but CONFIG_LV_DRAW_SW_ASM_CUSTOM not set. Assembly rendering not used"
+#else
+
+/*********************
+ *      DEFINES
+ *********************/
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888
+#define LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888(dsc) \
+    _lv_color_blend_to_argb8888_esp(dsc)
+#endif // LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB565
+#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565(dsc) \
+    _lv_color_blend_to_rgb565_esp(dsc)
+#endif // LV_DRAW_SW_COLOR_BLEND_TO_RGB565
+
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+typedef struct {                // byte offsets below are the ones the assembly routines load from (32-bit target)
+    uint32_t opa;               // offset 0  - not set by the fill wrappers in this header
+    void *dst_buf;              // offset 4  - destination buffer
+    uint32_t dst_w;             // offset 8  - destination width in pixels
+    uint32_t dst_h;             // offset 12 - destination height in rows
+    uint32_t dst_stride;        // offset 16 - destination row stride in bytes
+    const void *src_buf;        // offset 20 - source data; the fill wrappers pass the address of the fill color
+    uint32_t src_stride;        // offset 24 - not set by the fill wrappers in this header
+    const lv_opa_t *mask_buf;   // offset 28 - not set by the fill wrappers in this header
+    uint32_t mask_stride;       // offset 32 - not set by the fill wrappers in this header
+} asm_dsc_t;
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+extern int lv_color_blend_to_argb8888_esp(asm_dsc_t *asm_dsc);
+
+static inline lv_result_t _lv_color_blend_to_argb8888_esp(_lv_draw_sw_blend_fill_dsc_t *dsc)
+{
+    /* Repack the LVGL fill descriptor for the ARGB8888 assembly routine; fields not assigned below stay zero */
+    asm_dsc_t fill_args = {0};
+    fill_args.dst_buf = dsc->dest_buf;
+    fill_args.dst_w = dsc->dest_w;
+    fill_args.dst_h = dsc->dest_h;
+    fill_args.dst_stride = dsc->dest_stride;
+    fill_args.src_buf = &dsc->color;
+
+    return lv_color_blend_to_argb8888_esp(&fill_args);
+}
+
+extern int lv_color_blend_to_rgb565_esp(asm_dsc_t *asm_dsc);
+
+static inline lv_result_t _lv_color_blend_to_rgb565_esp(_lv_draw_sw_blend_fill_dsc_t *dsc)
+{
+    /* Repack the LVGL fill descriptor for the RGB565 assembly routine; fields not assigned below stay zero */
+    asm_dsc_t fill_args = {0};
+    fill_args.dst_buf = dsc->dest_buf;
+    fill_args.dst_w = dsc->dest_w;
+    fill_args.dst_h = dsc->dest_h;
+    fill_args.dst_stride = dsc->dest_stride;
+    fill_args.src_buf = &dsc->color;
+
+    return lv_color_blend_to_rgb565_esp(&fill_args);
+}
+
+#endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S
new file mode 100644
index 00000000..7d060675
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32.S
@@ -0,0 +1,81 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// This is LVGL ARGB8888 simple fill for ESP32 processor
+
+    .section .text
+    .align  4
+    .global lv_color_blend_to_argb8888_esp
+    .type   lv_color_blend_to_argb8888_esp,@function
+
+// The function implements the following C code:
+// void lv_color_blend_to_argb8888(_lv_draw_sw_blend_fill_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                l32i    0
+//     void * dst_buf;              l32i    4
+//     uint32_t dst_w;              l32i    8
+//     uint32_t dst_h;              l32i    12
+//     uint32_t dst_stride;         l32i    16
+//     const void * src_buf;        l32i    20
+//     uint32_t src_stride;         l32i    24
+//     const lv_opa_t * mask_buf;   l32i    28
+//     uint32_t mask_stride;        l32i    32
+// } asm_dsc_t;
+
+lv_color_blend_to_argb8888_esp:
+
+    entry   a1,    32
+
+    l32i.n   a3,    a2,    4                    // a3 - dest_buff
+    l32i.n   a4,    a2,    8                    // a4 - dest_w                in pixels (uint32_t each)
+    l32i.n   a5,    a2,    12                   // a5 - dest_h                in rows
+    l32i.n   a6,    a2,    16                   // a6 - dest_stride           in bytes
+    l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
+    l32i.n   a8,    a7,    0                    // a8 - color as value
+    slli     a11,   a4,    2                    // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w
+
+    movi     a7,    0xff000000                  // opacity mask
+    or       a10,    a7,    a8                  // apply opacity (a10 - pixel value written to dest_buff)
+
+    srli    a9,    a4,   2                      // a9 - loop_len = dest_w / 4
+    sub     a6,    a6,   a11                    // dest_stride = dest_stride - dest_w_bytes
+
+    .outer_loop:
+
+        // Run main loop which sets 16 bytes in one loop run
+        loopnez a9, ._main_loop
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+            s32i.n      a10,  a3,  8                    // save 32 bits from a10 to dest_buff a3, offset 8 bytes
+            s32i.n      a10,  a3,  12                   // save 32 bits from a10 to dest_buff a3, offset 12 bytes
+            addi.n      a3,   a3,  16                   // increment dest_buff pointer by 16 bytes
+        ._main_loop:
+
+        // Finish the remaining bytes out of the loop
+        // Check modulo 8 of the dest_w_bytes, if - then set 8 bytes
+        bbci a11, 3, _mod_8_check                       // branch if 3-rd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+            addi.n      a3,   a3,  8                    // increment dest_buff pointer by 8 bytes
+        _mod_8_check:
+
+        // Check modulo 4 of the dest_w_bytes, if - then set 4 bytes
+        bbci a11, 2, _mod_4_check                       // branch if 2-nd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4 bytes
+        _mod_4_check:
+
+        add     a3,  a3,  a6                             // move dest_buff to the next row (a6 = dest_stride - dest_w_bytes)
+        addi.n  a5,  a5,  -1                             // decrement the remaining row count
+    bnez a5, .outer_loop
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
new file mode 100644
index 00000000..4d9f84f1
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
@@ -0,0 +1,328 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// This is LVGL ARGB8888 simple fill for ESP32S3 processor
+
+    .section .text
+    .align  4
+    .global lv_color_blend_to_argb8888_esp
+    .type   lv_color_blend_to_argb8888_esp,@function
+// The function implements the following C code:
+// void lv_color_blend_to_argb8888(_lv_draw_sw_blend_fill_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                l32i    0
+//     void * dst_buf;              l32i    4
+//     uint32_t dst_w;              l32i    8
+//     uint32_t dst_h;              l32i    12
+//     uint32_t dst_stride;         l32i    16
+//     const void * src_buf;        l32i    20
+//     uint32_t src_stride;         l32i    24
+//     const lv_opa_t * mask_buf;   l32i    28
+//     uint32_t mask_stride;        l32i    32
+// } asm_dsc_t;
+
+
+lv_color_blend_to_argb8888_esp:
+
+    entry      a1,    32
+    ee.zero.q  q0                               // dummy TIE instruction, to enable the TIE
+
+    l32i.n   a3,    a2,    4                    // a3 - dest_buff
+    l32i.n   a4,    a2,    8                    // a4 - dest_w                in pixels (uint32_t each)
+    l32i.n   a5,    a2,    12                   // a5 - dest_h                in rows
+    l32i.n   a6,    a2,    16                   // a6 - dest_stride           in bytes
+    l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
+    l32i.n   a8,    a7,    0                    // a8 - color as value
+    slli     a11,   a4,    2                    // a11 - dest_w_bytes = sizeof(uint32_t) * dest_w
+
+    movi     a7,    0xff000000                  // opacity mask
+    or       a10,    a7,    a8                  // apply opacity (a10 - pixel value written to dest_buff)
+
+    // Check for short lengths
+    // dest_w should be at least 8, otherwise it's not worth using esp32s3 TIE
+    bgei     a4,   8,  _esp32s3_implementation          // Branch if dest_w is greater than or equal to 8
+    j .lv_color_blend_to_argb8888_esp32_body            // Jump to esp32 implementation
+
+    _esp32s3_implementation:
+
+    ee.movi.32.q   q0,   a10,  0                        // fill q0 register from a10 by 32 bits, segment 0
+    ee.movi.32.q   q0,   a10,  1                        // segment 1
+    ee.movi.32.q   q0,   a10,  2                        // segment 2
+    ee.movi.32.q   q0,   a10,  3                        // segment 3
+
+    // Check dest_buff alignment
+    movi.n   a7,   0xf                                  // 0xf alignment mask (16-byte alignment)
+    and     a15,   a7,  a3                              // 16-byte alignment mask AND dest_buff pointer
+    bnez    a15,   _unaligned_by_4byte                  // branch if a15 not equals to zero
+
+    // Check dest_stride alignment
+    and     a15,   a7,  a6                              // 16-byte alignment mask AND dest_stride
+    bnez    a15,   _unaligned_by_4byte                  // branch if a15 not equals to zero
+
+    // Check dest_w_bytes alignment
+    and     a15,   a7,  a11                             // 16-byte alignment mask AND dest_w_bytes
+    bnez    a15,   _unaligned_by_4byte                  // branch if a15 not equals to zero
+
+//**********************************************************************************************************************
+
+    // all aligned, the most ideal case
+
+    // dest_buff    (a3)  - 16-byte aligned
+    // dest_stride  (a6)  - 16-byte multiple
+    // dest_w_bytes (a11) - 16-byte multiple
+
+    srli    a9,    a4,   2                              // a9 - loop_len = dest_w / 4
+    sub     a6,    a6,   a11                            // dest_stride = dest_stride - dest_w_bytes
+
+    .outer_loop_aligned:
+
+        loopnez  a9, ._main_loop_aligned                // 16 bytes (4 argb8888) in one loop
+            ee.vst.128.ip q0, a3, 16                    // store 16 bytes from q0 to dest_buff a3
+        ._main_loop_aligned:
+
+        add     a3,  a3,  a6                            // move dest_buff to the next row (a6 = dest_stride - dest_w_bytes)
+        addi.n  a5,  a5,  -1                            // decrement the remaining row count
+    bnez a5, .outer_loop_aligned
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
+
+    _unaligned_by_4byte:
+
+    // Check dest_buff alignment
+    movi.n   a7,    0x3                                 // 0x3 alignment mask (4-byte alignment)
+    and     a15,    a7,   a3                            // 4-byte alignment mask AND dest_buff pointer
+    bnez    a15,    _unaligned_by_1byte                 // branch if a15 not equals to zero
+
+    // Check dest_stride alignment
+    and     a15,    a7,   a6                            // 4-byte alignment mask AND dest_stride
+    bnez    a15,    _unaligned_by_1byte                 // branch if a15 not equals to zero
+
+//**********************************************************************************************************************
+
+    // at least one of dest_buff, dest_stride, dest_w_bytes is not 16-byte aligned
+    // dest_w_bytes is always 4-byte multiple
+    // all of the following are 4-byte aligned
+
+    // dest_buff    (a3)  - 16-byte, or 4-byte aligned
+    // dest_stride  (a6)  - 16-byte, or 4-byte multiple
+    // dest_w_bytes (a11) - 4-byte multiple
+
+    sub      a6,    a6,   a11                           // dest_stride = dest_stride - dest_w_bytes
+    movi.n   a7,    0xf                                 // 0xf alignment mask
+
+    .outer_loop_aligned_by_4byte:
+
+        // alignment check
+        and     a15,   a7,  a3                          // 0xf (alignment mask) AND dest_buff pointer
+        mov     a12,   a11                              // a12 - local_dest_w_bytes = dest_w_bytes
+        beqz    a15,   _dest_buff_aligned_by_4byte       // branch if a15 equals to zero
+
+
+            movi.n  a14,   16                           // a14 - 16
+            sub     a15,   a14,   a15                   // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
+            sub     a12,   a12,   a15                   // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)
+
+            // keep setting until dest_buff is aligned
+            // Check modulo 8 of the unalignment, if - then set 8 bytes
+            bbci    a15,  3, _aligning_mod_8_check_4byte // branch if 3-rd bit of unalignment a15 is clear
+                s32i.n      a10,  a3,  0                // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+                s32i.n      a10,  a3,  4                // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+                addi.n      a3,   a3,  8                // increment dest_buff pointer by 8 bytes
+            _aligning_mod_8_check_4byte:
+
+            // Check modulo 4 of the unalignment, if - then set 4 bytes
+            bbci a15, 2, _aligning_mod_4_check_4byte     // branch if 2-nd bit of unalignment a15 is clear
+                s32i.n      a10,  a3,  0                // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+                addi.n      a3,   a3,  4                // increment dest_buff pointer by 4 bytes
+            _aligning_mod_4_check_4byte:
+
+        _dest_buff_aligned_by_4byte:
+        // Calculate main loop_len
+        srli    a9,    a12,   4                         // a9 - loop_len = local_dest_w_bytes / 16
+
+        // Main loop
+        loopnez  a9, ._main_loop_unaligned_by_4byte     // 16 bytes (4 argb8888) in one loop
+            ee.vst.128.ip q0, a3, 16                    // store 16 bytes from q0 to dest_buff a3
+        ._main_loop_unaligned_by_4byte:
+
+        // Check modulo 8 of the local_dest_w_bytes, if - then set 8 bytes
+        bbci a12, 3, _aligned_mod_8_check_4byte         // branch if 3-rd bit of local_dest_w_bytes a12 is clear
+            ee.vst.l.64.ip    q0,  a3,  8               // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
+        _aligned_mod_8_check_4byte:
+
+        // Check modulo 4 of the local_dest_w_bytes, if - then set 4 bytes
+        bbci a12, 2, _aligned_mod_4_check_4byte         // branch if 2-nd bit of local_dest_w_bytes a12 is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4 bytes
+        _aligned_mod_4_check_4byte:
+
+        add     a3,  a3,  a6                            // move dest_buff to the next row (a6 = dest_stride - dest_w_bytes)
+        addi.n  a5,  a5,  -1                            // decrement the remaining row count
+    bnez a5, .outer_loop_aligned_by_4byte
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
+
+    _unaligned_by_1byte:
+
+//**********************************************************************************************************************
+
+    // either dest_buff or dest_stride is not 4-byte aligned
+    // dest_w_bytes is always 4-byte multiple
+
+    // dest_buff    (a3)  - 4-byte, or 1-byte aligned
+    // dest_stride  (a6)  - 4-byte, or 1-byte multiple
+    // dest_w_bytes (a11) - 4-byte multiple
+
+
+    mov   a13, a3                                       // a13 - copy of dest_buff; NOTE(review): a13 is not read again in this routine - confirm and drop
+
+    ee.zero.q   q1                                      // clear q1
+    ee.orq      q1,    q1,   q0                         // copy q0 to q1
+    sub         a6,    a6,   a11                        // dest_stride = dest_stride - dest_w_bytes
+    movi.n      a7,    0xf                              // 0xf alignment mask
+
+    .outer_loop_aligned_by_1byte:
+
+        // alignment check
+        and     a15,   a7,  a3                          // 0xf (alignment mask) AND dest_buff pointer
+        mov     a12,   a11                              // a12 - local_dest_w_bytes = dest_w_bytes
+        beqz    a15,   _dest_buff_aligned_by_1byte      // branch if a15 equals to zero
+
+
+            movi.n  a14,   16                           // a14 - 16
+            sub     a15,   a14,   a15                   // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
+            sub     a12,   a12,   a15                   // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)
+
+            // keep setting until dest_buff is aligned
+            // Check modulo 8 of the unalignment, if - then set 8 bytes
+            bbci    a15,  3, _aligning_mod_8_check_1byte// branch if 3-rd bit of unalignment a15 is clear
+                s32i.n      a10,  a3,  0                // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+                s32i.n      a10,  a3,  4                // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+                addi.n      a3,   a3,  8                // increment dest_buff pointer by 8 bytes
+            _aligning_mod_8_check_1byte:
+
+            // Check modulo 4 of the unalignment, if - then set 4 bytes
+            bbci a15, 2, _aligning_mod_4_check_1byte    // branch if 2-nd bit of unalignment a15 is clear
+                s32i.n      a10,  a3,  0                // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+                addi.n      a3,   a3,  4                // increment dest_buff pointer by 4 bytes
+            _aligning_mod_4_check_1byte:
+
+            // Check modulo 2 and 1 (the following 2 ifs do the same correction)
+            // modulo 2 and modulo 1 requires the same action, just once
+            bbci a15, 1, _aligning_mod_2_check_1byte    // branch if 1-st bit of unalignment a15 is clear
+                s32i.n      a10,  a3,  0                // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+                addi.n      a3,   a3,  4                // increment dest_buff pointer by 4 bytes
+                j _dest_buff_aligned_by_1byte
+            _aligning_mod_2_check_1byte:
+
+            bbci a15, 0, _dest_buff_aligned_by_1byte    // branch if 0-th bit of unalignment a15 is clear
+                s32i.n      a10,  a3,  0                // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+                addi.n      a3,   a3,  4                // increment dest_buff pointer by 4 bytes
+        _dest_buff_aligned_by_1byte:
+
+        // Shift q reg, allowing to set 16-byte unaligned data
+        wur.sar_byte     a15                            // apply unalignment to the SAR_BYTE
+        ee.src.q   q2,  q0,  q1                         // shift concat. of q0 and q1 to q2 by SAR_BYTE amount
+
+        // Calculate main loop_len
+        srli    a9,    a12,   4                         // a9 - loop_len = local_dest_w_bytes / 16
+
+        // Main loop
+        loopnez  a9, ._main_loop_unaligned_by_1byte     // 16 bytes (4 argb8888) in one loop
+            ee.vst.128.ip q2, a3, 16                    // store 16 bytes from q2 to dest_buff a3
+        ._main_loop_unaligned_by_1byte:
+
+        // Firstly check mod 2 and mod 1 - correcting the aligned memory access
+        // Go back by 4 bytes, allowing to correct after ee.vst.128.ip aligned access
+        addi    a3, a3, -4
+
+        // Check modulo 2 of the local_dest_w_bytes, if - then set 2 bytes
+        // set SSSS in 0xSSSS0000
+        bbci a12, 1, _aligned_mod_2_check_1byte         // branch if 1-st bit of local_dest_w_bytes a12 is clear
+            srli    a14,   a10,  16                     // shift a10 right by 16 into a14, allowing s16i (saving of lower 16 bits)
+            s16i    a14,   a3,   2                      // save 16 bits from a14 to dest_buff a3, offset 2 bytes
+
+            // Check modulo 1 of the local_dest_w_bytes, if - then set 1 byte
+            // additionally set SS in 0x0000SS00
+            bbci a12, 0, _aligned_end                   // branch if 0-th bit of local_dest_w_bytes a12 is clear
+                srli    a14,   a10,  8                  // shift a10 right by 8 into a14, allowing s8i
+                s8i     a14,   a3,   1                  // save 8 bits from a14 to dest_buff a3, offset 1 byte
+                j _aligned_end
+        _aligned_mod_2_check_1byte:
+
+        // Check modulo 1 of the local_dest_w_bytes, if - then set 1 byte
+        // set SS in 0xSS000000
+        bbci a12, 0, _aligned_end                       // branch if 0-th bit of local_dest_w_bytes a12 is clear
+            srli    a14,   a10,  24                     // shift a10 right by 24 into a14, allowing s8i (saving of lower 8 bits)
+            s8i     a14,   a3,   3                      // save 8 bits from a14 to dest_buff a3, offset 3 bytes
+        _aligned_end:
+
+        addi    a3, a3, 4                               // Increase the pointer back, correction for addi    a3, a3, -4
+
+        // Check modulo 8 of the local_dest_w_bytes, if - then set 8 bytes
+        bbci a12, 3, _aligned_mod_8_check_1byte         // branch if 3-rd bit of local_dest_w_bytes a12 is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+            addi.n      a3,   a3,  8                    // increment dest_buff pointer by 8 bytes
+            //ee.vst.l.64.ip    q2,  a3,  8               // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
+        _aligned_mod_8_check_1byte:
+
+        // Check modulo 4 of the local_dest_w_bytes, if - then set 4 bytes
+        bbci a12, 2, _aligned_mod_4_check_1byte         // branch if 2-nd bit of local_dest_w_bytes a12 is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4 bytes
+        _aligned_mod_4_check_1byte:
+
+        add     a3,  a3,  a6                            // move dest_buff to the next row (a6 = dest_stride - dest_w_bytes)
+        addi.n  a5,  a5,  -1                            // decrement the remaining row count
+    bnez a5, .outer_loop_aligned_by_1byte
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
+
+    .lv_color_blend_to_argb8888_esp32_body:
+
+    srli    a9,    a4,   2                              // a9 - loop_len = dest_w / 4
+    sub     a6,    a6,   a11                            // dest_stride = dest_stride - dest_w_bytes
+
+    .outer_loop:
+
+        // Run main loop which sets 16 bytes in one loop run
+        loopnez a9, ._main_loop
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+            s32i.n      a10,  a3,  8                    // save 32 bits from a10 to dest_buff a3, offset 8 bytes
+            s32i.n      a10,  a3,  12                   // save 32 bits from a10 to dest_buff a3, offset 12 bytes
+            addi.n      a3,   a3,  16                   // increment dest_buff pointer by 16 bytes
+        ._main_loop:
+
+        // Finish the remaining bytes out of the loop
+        // Check modulo 8 of the dest_w_bytes, if - then set 8 bytes
+        bbci a11, 3, _mod_8_check                       // branch if 3-rd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            s32i.n      a10,  a3,  4                    // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+            addi.n      a3,   a3,  8                    // increment dest_buff pointer by 8 bytes
+        _mod_8_check:
+
+        // Check modulo 4 of the dest_w_bytes, if - then set 4 bytes
+        bbci a11, 2, _mod_4_check                       // branch if 2-nd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4 bytes
+        _mod_4_check:
+
+        add     a3,  a3,  a6                             // move dest_buff to the next row (a6 = dest_stride - dest_w_bytes)
+        addi.n  a5,  a5,  -1                             // decrement the remaining row count
+    bnez a5, .outer_loop
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S
new file mode 100644
index 00000000..07b5aa11
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32.S
@@ -0,0 +1,149 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// This is LVGL RGB565 simple fill for ESP32 processor
+
+    .section .text
+    .align  4
+    .global lv_color_blend_to_rgb565_esp
+    .type   lv_color_blend_to_rgb565_esp,@function
+// The function implements the following C code:
+// void lv_color_blend_to_rgb565(_lv_draw_sw_blend_fill_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                l32i    0
+//     void * dst_buf;              l32i    4
+//     uint32_t dst_w;              l32i    8
+//     uint32_t dst_h;              l32i    12
+//     uint32_t dst_stride;         l32i    16
+//     const void * src_buf;        l32i    20
+//     uint32_t src_stride;         l32i    24
+//     const lv_opa_t * mask_buf;   l32i    28
+//     uint32_t mask_stride;        l32i    32
+// } asm_dsc_t;
+
+lv_color_blend_to_rgb565_esp:
+
+    entry   a1,    32                           // reserve a 32-byte stack frame, a1 + 0 .. a1 + 12 hold the cached loop parameters
+
+    l32i.n   a3,    a2,    4                    // a3 - dest_buff
+    l32i.n   a4,    a2,    8                    // a4 - dest_w                in pixels (uint16_t units)
+    l32i.n   a5,    a2,    12                   // a5 - dest_h                in rows, used as the outer loop counter
+    l32i.n   a6,    a2,    16                   // a6 - dest_stride           in bytes
+    l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
+    l32i.n   a8,    a7,    0                    // a8 - color as value (not used, a8 is overwritten by the align mask below)
+    slli     a11,   a4,    1                    // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
+
+    // Convert color to rgb565 (upper 5 bits of red, upper 6 bits of green, upper 5 bits of blue)
+    l8ui    a15,    a7,    2                    // red, byte 2 of the color
+    movi.n  a14,    0xf8                        // mask keeping the upper 5 bits
+    and     a13,    a15,   a14
+    slli    a10,    a13,   8                    // red moved to bits 15..11
+
+    l8ui    a15,    a7,    0                    // blue, byte 0 of the color
+    and     a13,    a15,   a14
+    srli    a12,    a13,   3                    // blue moved to bits 4..0
+    add     a10,    a10,   a12
+
+    l8ui    a15,    a7,    1                    // green, byte 1 of the color
+    movi.n  a14,    0xfc                        // mask keeping the upper 6 bits
+    and     a13,    a15,   a14
+    slli    a12,    a13,   3                    // green moved to bits 10..5
+    add     a12,    a10,   a12                  // a12 = 16-bit color
+
+    slli    a10,    a12,   16                   // 16-bit color moved to the upper half-word
+    movi.n  a13,    0xFFFF0000
+    and     a10,    a10,    a13                 // keep the upper half-word only
+    or      a10,    a10,    a12                 // a10 = 32-bit color (16bit + (16bit << 16))
+
+    movi.n  a8,    0x3                          // a8 = 0x3, dest_buff align mask
+    sub     a6,    a6,   a11                    // dest_stride = dest_stride - dest_w_bytes
+
+    // cache init
+    // Prepare main loop length and dest_w_bytes
+    srli     a9,     a4,    4                    // a9 = loop_len = dest_w / 16 (main loop sets 16 pixels per run), calculate main loop_len for original dest_w
+    slli     a11,    a4,    1                    // a11 = dest_w_bytes = sizeof(uint16_t) * dest_w
+    addi     a4,     a4,   -1                    // a4-- (decrement a4)
+    s32i.n   a9,     a1,    0                    // cache.orig.loop_len
+    s32i.n   a11,    a1,    4                    // cache.orig.dest_w_bytes
+
+    // Prepare decreased main loop length and dest_w_bytes
+    srli     a9,     a4,    4                    // a9 = loop_len = (dest_w - 1) / 16, calculate main loop_len for dest_w - 1
+    slli     a11,    a4,    1                    // a11 = dest_w_bytes = sizeof(uint16_t) * (dest_w - 1)
+    s32i.n   a9,     a1,    8                    // cache.decr.loop_len
+    s32i.n   a11,    a1,    12                   // cache.decr.dest_w_bytes
+    and      a7,     a8,    a3                   // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned)
+
+    .outer_loop:
+
+        // Check if the dest_buff is 4-byte aligned, if not - set one pixel first and continue with the decreased (dest_w - 1) values
+        beqz  a7, _dest_buff_2_byte_aligned      // branch if a7 is equal to zero (dest_buff is 4-byte aligned)
+            s16i        a12,  a3,   0            // save 16 bits from 16-bit color a12 to dest_buff a3, offset 0
+            l32i.n      a9,   a1,   8            // a9  = load cache.decr.loop_len
+            l32i.n      a11,  a1,   12           // a11 = load cache.decr.dest_w_bytes
+            addi.n      a3,   a3,   2            // increment dest_buff pointer by 2
+            j           _dest_buff_unaligned     // skip loading of the original (not decreased) values
+        _dest_buff_2_byte_aligned:               // reached when dest_buff AND 0x3 is zero
+
+        l32i.n      a9,   a1,   0                // a9  = load cache.orig.loop_len
+        l32i.n      a11,  a1,   4                // a11 = load cache.orig.dest_w_bytes
+
+        _dest_buff_unaligned:                    // common continuation of both paths above
+
+        // Run main loop which sets 32 bytes (16 pixels) in one loop run
+        loopnez a9, ._main_loop
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
+            s32i.n      a10,  a3,  8                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8
+            s32i.n      a10,  a3,  12                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 12
+            s32i.n      a10,  a3,  16                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 16
+            s32i.n      a10,  a3,  20                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 20
+            s32i.n      a10,  a3,  24                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 24
+            s32i.n      a10,  a3,  28                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 28
+            addi.n      a3,   a3,  32                   // increment dest_buff pointer by 32
+        ._main_loop:
+
+        // Finish the remaining bytes out of the loop
+        // Check bit 4 of the dest_w_bytes, if set - then set 16 bytes
+        bbci a11, 4, _mod_16_check                      // branch if 4-th bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
+            s32i.n      a10,  a3,  8                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8
+            s32i.n      a10,  a3,  12                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 12
+            addi.n      a3,   a3,  16                   // increment dest_buff pointer by 16
+        _mod_16_check:
+
+        // Finish the remaining bytes out of the loop
+        // Check bit 3 of the dest_w_bytes, if set - then set 8 bytes
+        bbci a11, 3, _mod_8_check                       // branch if 3-rd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
+            addi.n      a3,   a3,  8                    // increment dest_buff pointer by 8 bytes
+        _mod_8_check:
+
+        // Check bit 2 of the dest_w_bytes, if set - then set 4 bytes
+        bbci a11, 2, _mod_4_check                       // branch if 2-nd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4
+        _mod_4_check:
+
+        // Check bit 1 of the dest_w_bytes, if set - then set 2 bytes
+        bbci a11, 1, _mod_2_check                       // branch if 1-st bit of dest_w_bytes is clear
+            s16i        a12,  a3,  0                    // save 16 bits from 16-bit color a12 to dest_buff a3, offset 0
+            addi.n      a3,   a3,  2                    // increment dest_buff pointer by 2 bytes
+        _mod_2_check:
+
+        add     a3,  a3,  a6                            // dest_buff + (dest_stride - dest_w_bytes), move to the next row
+        addi.n  a5,  a5,  -1                            // decrease the outer loop counter (remaining rows)
+        and     a7,  a8,  a3                            // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned)
+    bnez a5, .outer_loop
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
new file mode 100644
index 00000000..07b5aa11
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
@@ -0,0 +1,149 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// This is LVGL RGB565 simple fill for ESP32S3 processor
+
+    .section .text
+    .align  4
+    .global lv_color_blend_to_rgb565_esp
+    .type   lv_color_blend_to_rgb565_esp,@function
+// The function implements the following C code:
+// void lv_color_blend_to_rgb565(_lv_draw_sw_blend_fill_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                l32i    0
+//     void * dst_buf;              l32i    4
+//     uint32_t dst_w;              l32i    8
+//     uint32_t dst_h;              l32i    12
+//     uint32_t dst_stride;         l32i    16
+//     const void * src_buf;        l32i    20
+//     uint32_t src_stride;         l32i    24
+//     const lv_opa_t * mask_buf;   l32i    28
+//     uint32_t mask_stride;        l32i    32
+// } asm_dsc_t;
+
+lv_color_blend_to_rgb565_esp:
+
+    entry   a1,    32                           // reserve a 32-byte stack frame, a1 + 0 .. a1 + 12 hold the cached loop parameters
+
+    l32i.n   a3,    a2,    4                    // a3 - dest_buff
+    l32i.n   a4,    a2,    8                    // a4 - dest_w                in pixels (uint16_t units)
+    l32i.n   a5,    a2,    12                   // a5 - dest_h                in rows, used as the outer loop counter
+    l32i.n   a6,    a2,    16                   // a6 - dest_stride           in bytes
+    l32i.n   a7,    a2,    20                   // a7 - src_buff (color)
+    l32i.n   a8,    a7,    0                    // a8 - color as value (not used, a8 is overwritten by the align mask below)
+    slli     a11,   a4,    1                    // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
+
+    // Convert color to rgb565 (upper 5 bits of red, upper 6 bits of green, upper 5 bits of blue)
+    l8ui    a15,    a7,    2                    // red, byte 2 of the color
+    movi.n  a14,    0xf8                        // mask keeping the upper 5 bits
+    and     a13,    a15,   a14
+    slli    a10,    a13,   8                    // red moved to bits 15..11
+
+    l8ui    a15,    a7,    0                    // blue, byte 0 of the color
+    and     a13,    a15,   a14
+    srli    a12,    a13,   3                    // blue moved to bits 4..0
+    add     a10,    a10,   a12
+
+    l8ui    a15,    a7,    1                    // green, byte 1 of the color
+    movi.n  a14,    0xfc                        // mask keeping the upper 6 bits
+    and     a13,    a15,   a14
+    slli    a12,    a13,   3                    // green moved to bits 10..5
+    add     a12,    a10,   a12                  // a12 = 16-bit color
+
+    slli    a10,    a12,   16                   // 16-bit color moved to the upper half-word
+    movi.n  a13,    0xFFFF0000
+    and     a10,    a10,    a13                 // keep the upper half-word only
+    or      a10,    a10,    a12                 // a10 = 32-bit color (16bit + (16bit << 16))
+
+    movi.n  a8,    0x3                          // a8 = 0x3, dest_buff align mask
+    sub     a6,    a6,   a11                    // dest_stride = dest_stride - dest_w_bytes
+
+    // cache init
+    // Prepare main loop length and dest_w_bytes
+    srli     a9,     a4,    4                    // a9 = loop_len = dest_w / 16 (main loop sets 16 pixels per run), calculate main loop_len for original dest_w
+    slli     a11,    a4,    1                    // a11 = dest_w_bytes = sizeof(uint16_t) * dest_w
+    addi     a4,     a4,   -1                    // a4-- (decrement a4)
+    s32i.n   a9,     a1,    0                    // cache.orig.loop_len
+    s32i.n   a11,    a1,    4                    // cache.orig.dest_w_bytes
+
+    // Prepare decreased main loop length and dest_w_bytes
+    srli     a9,     a4,    4                    // a9 = loop_len = (dest_w - 1) / 16, calculate main loop_len for dest_w - 1
+    slli     a11,    a4,    1                    // a11 = dest_w_bytes = sizeof(uint16_t) * (dest_w - 1)
+    s32i.n   a9,     a1,    8                    // cache.decr.loop_len
+    s32i.n   a11,    a1,    12                   // cache.decr.dest_w_bytes
+    and      a7,     a8,    a3                   // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned)
+
+    .outer_loop:
+
+        // Check if the dest_buff is 4-byte aligned, if not - set one pixel first and continue with the decreased (dest_w - 1) values
+        beqz  a7, _dest_buff_2_byte_aligned      // branch if a7 is equal to zero (dest_buff is 4-byte aligned)
+            s16i        a12,  a3,   0            // save 16 bits from 16-bit color a12 to dest_buff a3, offset 0
+            l32i.n      a9,   a1,   8            // a9  = load cache.decr.loop_len
+            l32i.n      a11,  a1,   12           // a11 = load cache.decr.dest_w_bytes
+            addi.n      a3,   a3,   2            // increment dest_buff pointer by 2
+            j           _dest_buff_unaligned     // skip loading of the original (not decreased) values
+        _dest_buff_2_byte_aligned:               // reached when dest_buff AND 0x3 is zero
+
+        l32i.n      a9,   a1,   0                // a9  = load cache.orig.loop_len
+        l32i.n      a11,  a1,   4                // a11 = load cache.orig.dest_w_bytes
+
+        _dest_buff_unaligned:                    // common continuation of both paths above
+
+        // Run main loop which sets 32 bytes (16 pixels) in one loop run
+        loopnez a9, ._main_loop
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
+            s32i.n      a10,  a3,  8                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8
+            s32i.n      a10,  a3,  12                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 12
+            s32i.n      a10,  a3,  16                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 16
+            s32i.n      a10,  a3,  20                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 20
+            s32i.n      a10,  a3,  24                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 24
+            s32i.n      a10,  a3,  28                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 28
+            addi.n      a3,   a3,  32                   // increment dest_buff pointer by 32
+        ._main_loop:
+
+        // Finish the remaining bytes out of the loop
+        // Check bit 4 of the dest_w_bytes, if set - then set 16 bytes
+        bbci a11, 4, _mod_16_check                      // branch if 4-th bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
+            s32i.n      a10,  a3,  8                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 8
+            s32i.n      a10,  a3,  12                   // save 32 bits from 32-bit color a10 to dest_buff a3, offset 12
+            addi.n      a3,   a3,  16                   // increment dest_buff pointer by 16
+        _mod_16_check:
+
+        // Finish the remaining bytes out of the loop
+        // Check bit 3 of the dest_w_bytes, if set - then set 8 bytes
+        bbci a11, 3, _mod_8_check                       // branch if 3-rd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            s32i.n      a10,  a3,  4                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 4
+            addi.n      a3,   a3,  8                    // increment dest_buff pointer by 8 bytes
+        _mod_8_check:
+
+        // Check bit 2 of the dest_w_bytes, if set - then set 4 bytes
+        bbci a11, 2, _mod_4_check                       // branch if 2-nd bit of dest_w_bytes is clear
+            s32i.n      a10,  a3,  0                    // save 32 bits from 32-bit color a10 to dest_buff a3, offset 0
+            addi.n      a3,   a3,  4                    // increment dest_buff pointer by 4
+        _mod_4_check:
+
+        // Check bit 1 of the dest_w_bytes, if set - then set 2 bytes
+        bbci a11, 1, _mod_2_check                       // branch if 1-st bit of dest_w_bytes is clear
+            s16i        a12,  a3,  0                    // save 16 bits from 16-bit color a12 to dest_buff a3, offset 0
+            addi.n      a3,   a3,  2                    // increment dest_buff pointer by 2 bytes
+        _mod_2_check:
+
+        add     a3,  a3,  a6                            // dest_buff + (dest_stride - dest_w_bytes), move to the next row
+        addi.n  a5,  a5,  -1                            // decrease the outer loop counter (remaining rows)
+        and     a7,  a8,  a3                            // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned)
+    bnez a5, .outer_loop
+
+    movi.n   a2, 1                                      // return LV_RESULT_OK = 1
+    retw.n                                              // return
diff --git a/components/esp_lvgl_port/test_apps/CMakeLists.txt b/components/esp_lvgl_port/test_apps/lvgl_port/CMakeLists.txt
similarity index 100%
rename from components/esp_lvgl_port/test_apps/CMakeLists.txt
rename to components/esp_lvgl_port/test_apps/lvgl_port/CMakeLists.txt
diff --git a/components/esp_lvgl_port/test_apps/main/CMakeLists.txt b/components/esp_lvgl_port/test_apps/lvgl_port/main/CMakeLists.txt
similarity index 100%
rename from components/esp_lvgl_port/test_apps/main/CMakeLists.txt
rename to components/esp_lvgl_port/test_apps/lvgl_port/main/CMakeLists.txt
diff --git a/components/esp_lvgl_port/test_apps/main/idf_component.yml b/components/esp_lvgl_port/test_apps/lvgl_port/main/idf_component.yml
similarity index 59%
rename from components/esp_lvgl_port/test_apps/main/idf_component.yml
rename to components/esp_lvgl_port/test_apps/lvgl_port/main/idf_component.yml
index 1f1a2d1a..39731a01 100644
--- a/components/esp_lvgl_port/test_apps/main/idf_component.yml
+++ b/components/esp_lvgl_port/test_apps/lvgl_port/main/idf_component.yml
@@ -3,7 +3,7 @@ dependencies:
   idf: ">=4.4"
   esp_lcd_touch_tt21100:
     version: "^1"
-    override_path: "../../../lcd_touch/esp_lcd_touch_tt21100/"
+    override_path: "../../../../lcd_touch/esp_lcd_touch_tt21100/"
   esp_lvgl_port:
     version: "*"
-    override_path: "../../"
+    override_path: "../../../"
diff --git a/components/esp_lvgl_port/test_apps/main/test.c b/components/esp_lvgl_port/test_apps/lvgl_port/main/test.c
similarity index 100%
rename from components/esp_lvgl_port/test_apps/main/test.c
rename to components/esp_lvgl_port/test_apps/lvgl_port/main/test.c
diff --git a/components/esp_lvgl_port/test_apps/lvgl_port/sdkconfig.ci.asm_render b/components/esp_lvgl_port/test_apps/lvgl_port/sdkconfig.ci.asm_render
new file mode 100644
index 00000000..30815ee1
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/lvgl_port/sdkconfig.ci.asm_render
@@ -0,0 +1,6 @@
+# sdkconfig to enable the SIMD in the lvgl_port
+
+# Set custom ASM render and provide a header file with function prototypes
+CONFIG_LV_DRAW_SW_ASM_CUSTOM=y
+CONFIG_LV_USE_DRAW_SW_ASM=255
+CONFIG_LV_DRAW_SW_ASM_CUSTOM_INCLUDE="esp_lvgl_port_lv_blend.h"
diff --git a/components/esp_lvgl_port/test_apps/sdkconfig.defaults b/components/esp_lvgl_port/test_apps/lvgl_port/sdkconfig.defaults
similarity index 100%
rename from components/esp_lvgl_port/test_apps/sdkconfig.defaults
rename to components/esp_lvgl_port/test_apps/lvgl_port/sdkconfig.defaults
diff --git a/components/esp_lvgl_port/test_apps/simd/CMakeLists.txt b/components/esp_lvgl_port/test_apps/simd/CMakeLists.txt
new file mode 100644
index 00000000..735c48dc
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/CMakeLists.txt
@@ -0,0 +1,7 @@
+# The following lines of boilerplate have to be in your project's
+# CMakeLists in this exact order for cmake to work correctly
+cmake_minimum_required(VERSION 3.16)
+
+include($ENV{IDF_PATH}/tools/cmake/project.cmake)
+
+project(test_lvgl_simd)
\ No newline at end of file
diff --git a/components/esp_lvgl_port/test_apps/simd/README.md b/components/esp_lvgl_port/test_apps/simd/README.md
new file mode 100644
index 00000000..7c579c4f
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/README.md
@@ -0,0 +1,109 @@
+# HW Acceleration using SIMD assembly instructions
+
+The test app accommodates two types of tests: [`functionality test`](#Functionality-test) and [`benchmark test`](#Benchmark-test). Both tests are provided for each function written in assembly (typically for each assembly file). Both tests use a hard copy of the LVGL blending API, representing an ANSI implementation of the LVGL blending functions. The hard copy is present in the [`lv_blend`](main/lv_blend/) folder.
+
+Assembly source files can be found in the [`lvgl_port`](../../src/lvgl9/simd/) component. The header file with the assembly function prototypes is provided to LVGL using the Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h).
+
+## Functionality test
+* Tests whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version
+* A top-level flow of the functionality test:
+    * generate a test matrix with test parameters (matrix width, matrix height, memory alignment.. )
+    * run an ANSI version of a DUT function with the generated input parameters
+    * run an assembly version of a DUT function with the same input parameters
+    * compare the results given by the ANSI and the assembly DUTs
+    * the results shall be the same
+    * repeat all the steps for a set of different input parameters, checking different matrix heights, widths..
+
+## Benchmark test
+* Tests whether the HW accelerated assembly version of an LVGL function provides a performance increase over the ANSI version
+* A top-level flow of the benchmark test:
+    * generate a test matrix with test parameters (matrix width, matrix height, memory alignment.. )
+    * run an ANSI version of a DUT function with the generated input parameters multiple times (1000 times for example), while counting CPU cycles
+    * run an assembly version of a DUT function with the generated input parameters multiple times (1000 times for example), while counting CPU cycles
+    * compare the results given by the ANSI and the assembly DUTs
+    * the assembly version of the DUT function shall be faster than the ANSI version of the DUT function
+
+## Run the test app
+
+The test app is intended to be used only with esp32 and esp32s3
+
+    idf.py build
+
+## Example output
+
+```
+I (302) main_task: Started on CPU0
+I (322) main_task: Calling app_main()
+______  _____ ______   _               _   
+|  _  \/  ___|| ___ \ | |             | |  
+| | | |\ `--. | |_/ / | |_   ___  ___ | |_ 
+| | | | `--. \|  __/  | __| / _ \/ __|| __|
+| |/ / /\__/ /| |     | |_ |  __/\__ \| |_ 
+|___/  \____/ \_|      \__| \___||___/ \__|
+
+
+Press ENTER to see the list of tests.
+
+
+
+Here's the test menu, pick your combo:
+(1)	"Test fill functionality ARGB8888" [fill][functionality][ARGB8888]
+(2)	"Test fill functionality RGB565" [fill][functionality][RGB565]
+(3)	"LV Fill benchmark ARGB8888" [fill][benchmark][ARGB8888]
+(4)	"LV Fill benchmark RGB565" [fill][benchmark][RGB565]
+
+Enter test for running.
+```
+
+### Example of a functionality test run
+
+```
+Running Test fill functionality ARGB8888...
+I (81512) LV Fill Functionality: running test for ARGB8888 color format
+I (84732) LV Fill Functionality: test combinations: 31824
+
+MALLOC_CAP_8BIT usage: Free memory delta: 0 Leak threshold: -800 
+MALLOC_CAP_32BIT usage: Free memory delta: 0 Leak threshold: -800 
+./main/test_lv_fill_functionality.c:102:Test fill functionality ARGB8888:PASS
+Test ran in 3242ms
+```
+The test gives a simple FAIL/PASS result after comparing the results of the two DUTs.
+It also reports how many combinations of input parameters the functionality test was run with, `31824` in this case.
+
+### Example of a benchmark test run
+
+```
+Running LV Fill benchmark ARGB8888...
+I (163492) LV Fill Benchmark: running test for ARGB8888 color format
+I (163522) LV Fill Benchmark:  ASM ideal case: 5363.123 cycles for 128x128 matrix, 0.327 cycles per sample
+I (163572) LV Fill Benchmark:  ASM corner case: 7868.724 cycles for 127x127 matrix, 0.488 cycles per sample
+
+I (163732) LV Fill Benchmark:  ANSI ideal case: 26219.137 cycles for 128x128 matrix, 1.600 cycles per sample
+I (163902) LV Fill Benchmark:  ANSI corner case: 25762.178 cycles for 127x127 matrix, 1.597 cycles per sample
+
+MALLOC_CAP_8BIT usage: Free memory delta: -220 Leak threshold: -800 
+MALLOC_CAP_8BIT potential leak: Before 393820 bytes free, After 393600 bytes free (delta 220)
+MALLOC_CAP_32BIT usage: Free memory delta: -220 Leak threshold: -800 
+MALLOC_CAP_32BIT potential leak: Before 393820 bytes free, After 393600 bytes free (delta 220)
+./main/test_lv_fill_benchmark.c:69:LV Fill benchmark ARGB8888:PASS
+Test ran in 458ms
+```
+
+The test provides several pieces of information:
+* Total number of CPU cycles for the whole DUT function
+    * `5363.123` cycles for the assembly DUT function
+    * `26219.137` cycles for the ANSI DUT function
+* Number of CPU cycles per sample, which is basically the total number of CPU cycles divided by the test matrix area
+    * `0.327` cycles per sample for the assembly DUT
+    * `1.6` cycles per sample for the ANSI DUT
+    * In this case, the assembly implementation has achieved a performance increase of around 4.9 times compared to the ANSI implementation.
+* Range of the CPU cycles (best case and corner case scenarios) into which the DUT functions are expected to fit
+    * The execution time of those functions highly depends on the input parameters, thus boundary scenarios for the input parameters shall be set
+    * An example of such boundaries is in the table below
+    * The benchmark boundaries help us to get performance expectations for real scenarios
+
+Example of best case and corner case input parameters for the benchmark test, for the color format `ARGB8888`
+| Test matrix params | Memory alignment | Width          | Height         | Stride         |
+| :----------------- | :--------------- | :------------- | :------------- | :------------- |
+| Best case          | 16-byte aligned  | Multiple of 8  | Multiple of 8  | Multiple of 8  |
+| Corner case        | 1-byte aligned   | Not power of 2 | Not power of 2 | Not power of 2 |
diff --git a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
new file mode 100644
index 00000000..0a6d5da4
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Include SIMD assembly source code for rendering (taken from the lvgl_port component sources, selected by target)
+if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3)
+    message(VERBOSE "Compiling SIMD")
+    set(PORT_PATH "../../../src/lvgl9")                                 # lvgl_port lvgl9 sources, relative to this directory
+
+    if(CONFIG_IDF_TARGET_ESP32S3)
+        file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32s3.S)    # Select only esp32s3 related files
+    else()
+        file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32.S)      # Select only esp32 related files
+    endif()
+else()
+    message(WARNING "This test app is intended only for esp32 and esp32s3")    # ASM_SOURCES is not set in this branch
+endif()
+
+# Hard copy of LV files (C sources of the LVGL blend API located in lv_blend/src)
+file(GLOB_RECURSE BLEND_SRCS lv_blend/src/*.c)
+
+idf_component_register(SRCS "test_app_main.c" "test_lv_fill_functionality.c" "test_lv_fill_benchmark.c" ${BLEND_SRCS} ${ASM_SOURCES}
+                      INCLUDE_DIRS "lv_blend/include" "../../../include"
+                      REQUIRES unity
+                      WHOLE_ARCHIVE)
diff --git a/components/esp_lvgl_port/test_apps/simd/main/Kconfig.projbuild b/components/esp_lvgl_port/test_apps/simd/main/Kconfig.projbuild
new file mode 100644
index 00000000..8d2c596c
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/Kconfig.projbuild
@@ -0,0 +1,5 @@
+# Creating CONFIG_LV_DRAW_SW_ASM_CUSTOM, available in the lvgl Kconfig, to enable the assembler source files by default
+
+config LV_DRAW_SW_ASM_CUSTOM
+    bool
+    default y
\ No newline at end of file
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_assert.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_assert.h
new file mode 100644
index 00000000..6fe5589d
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_assert.h
@@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_assert.h
+ *
+ */
+
+#ifndef LV_ASSERT_H
+#define LV_ASSERT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_log.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+#define LV_ASSERT_HANDLER while(1);   /*Halt by default: spin forever once an assertion has failed*/
+/*If `expr` is false, log its source text with LV_LOG_ERROR and then run LV_ASSERT_HANDLER*/
+#define LV_ASSERT(expr)                                        \
+    do {                                                       \
+        if(!(expr)) {                                          \
+            LV_LOG_ERROR("Asserted at expression: %s", #expr); \
+            LV_ASSERT_HANDLER                                  \
+        }                                                      \
+    } while(0)
+
+/*-----------------
+ * ASSERTS
+ *-----------------*/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_ASSERT_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color.h
new file mode 100644
index 00000000..ecb6017e
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color.h
@@ -0,0 +1,272 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_color.h
+ *
+ */
+
+#ifndef LV_COLOR_H
+#define LV_COLOR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "stdint.h"
+#include "stdbool.h"
+#include "sdkconfig.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+#define LV_ATTRIBUTE_FAST_MEM
+
+#ifndef LV_COLOR_MIX_ROUND_OFS
+#ifdef CONFIG_LV_COLOR_MIX_ROUND_OFS
+#define LV_COLOR_MIX_ROUND_OFS CONFIG_LV_COLOR_MIX_ROUND_OFS
+#else
+#define LV_COLOR_MIX_ROUND_OFS  0
+#endif
+#endif
+
+/**
+ * Opacity percentages. The named values live in an anonymous enum; `lv_opa_t` itself is one byte (as in upstream LVGL).
+ */
+typedef uint8_t lv_opa_t;   /*An enum type is not guaranteed to be 1 byte wide, so it is not used as the storage type*/
+enum {
+    LV_OPA_TRANSP = 0,
+    LV_OPA_0      = 0,
+    LV_OPA_10     = 25,
+    LV_OPA_20     = 51,
+    LV_OPA_30     = 76,
+    LV_OPA_40     = 102,
+    LV_OPA_50     = 127,
+    LV_OPA_60     = 153,
+    LV_OPA_70     = 178,
+    LV_OPA_80     = 204,
+    LV_OPA_90     = 229,
+    LV_OPA_100    = 255,
+    LV_OPA_COVER  = 255,
+};
+
+#define LV_OPA_MIN 2    /*Opacities below this will be transparent*/
+#define LV_OPA_MAX 253  /*Opacities above this will fully cover*/
+
+#define LV_COLOR_FORMAT_GET_BPP(cf) (       \
+                                            (cf) == LV_COLOR_FORMAT_I1 ? 1 :        \
+                                            (cf) == LV_COLOR_FORMAT_A1 ? 1 :        \
+                                            (cf) == LV_COLOR_FORMAT_I2 ? 2 :        \
+                                            (cf) == LV_COLOR_FORMAT_A2 ? 2 :        \
+                                            (cf) == LV_COLOR_FORMAT_I4 ? 4 :        \
+                                            (cf) == LV_COLOR_FORMAT_A4 ? 4 :        \
+                                            (cf) == LV_COLOR_FORMAT_L8 ? 8 :        \
+                                            (cf) == LV_COLOR_FORMAT_A8 ? 8 :        \
+                                            (cf) == LV_COLOR_FORMAT_I8 ? 8 :        \
+                                            (cf) == LV_COLOR_FORMAT_AL88 ? 16 :     \
+                                            (cf) == LV_COLOR_FORMAT_RGB565 ? 16 :   \
+                                            (cf) == LV_COLOR_FORMAT_RGB565A8 ? 16 : \
+                                            (cf) == LV_COLOR_FORMAT_ARGB8565 ? 24 : \
+                                            (cf) == LV_COLOR_FORMAT_RGB888 ? 24 :   \
+                                            (cf) == LV_COLOR_FORMAT_ARGB8888 ? 32 : \
+                                            (cf) == LV_COLOR_FORMAT_XRGB8888 ? 32 : \
+                                            0                                       \
+                                    )
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+typedef struct {
+    uint8_t blue;
+    uint8_t green;
+    uint8_t red;
+} lv_color_t;
+
+typedef struct {
+    uint16_t blue : 5;
+    uint16_t green : 6;
+    uint16_t red : 5;
+} lv_color16_t;
+
+typedef struct {
+    uint8_t blue;
+    uint8_t green;
+    uint8_t red;
+    uint8_t alpha;
+} lv_color32_t;
+
+typedef struct {
+    uint16_t h;
+    uint8_t s;
+    uint8_t v;
+} lv_color_hsv_t;
+
+typedef struct {
+    uint8_t lumi;
+    uint8_t alpha;
+} lv_color16a_t;
+
+typedef enum {
+    LV_COLOR_FORMAT_UNKNOWN           = 0,
+
+    LV_COLOR_FORMAT_RAW               = 0x01,
+    LV_COLOR_FORMAT_RAW_ALPHA         = 0x02,
+
+    /*<=1 byte (+alpha) formats*/
+    LV_COLOR_FORMAT_L8                = 0x06,
+    LV_COLOR_FORMAT_I1                = 0x07,
+    LV_COLOR_FORMAT_I2                = 0x08,
+    LV_COLOR_FORMAT_I4                = 0x09,
+    LV_COLOR_FORMAT_I8                = 0x0A,
+    LV_COLOR_FORMAT_A8                = 0x0E,
+
+    /*2 byte (+alpha) formats*/
+    LV_COLOR_FORMAT_RGB565            = 0x12,
+    LV_COLOR_FORMAT_ARGB8565          = 0x13,   /**< Not supported by sw renderer yet. */
+    LV_COLOR_FORMAT_RGB565A8          = 0x14,   /**< Color array followed by Alpha array*/
+    LV_COLOR_FORMAT_AL88              = 0x15,   /**< L8 with alpha >*/
+
+    /*3 byte (+alpha) formats*/
+    LV_COLOR_FORMAT_RGB888            = 0x0F,
+    LV_COLOR_FORMAT_ARGB8888          = 0x10,
+    LV_COLOR_FORMAT_XRGB8888          = 0x11,
+
+    /*Formats not supported by software renderer but kept here so GPU can use it*/
+    LV_COLOR_FORMAT_A1                = 0x0B,
+    LV_COLOR_FORMAT_A2                = 0x0C,
+    LV_COLOR_FORMAT_A4                = 0x0D,
+
+    /* reference to https://wiki.videolan.org/YUV/ */
+    /*YUV planar formats*/
+    LV_COLOR_FORMAT_YUV_START         = 0x20,
+    LV_COLOR_FORMAT_I420              = LV_COLOR_FORMAT_YUV_START,  /*YUV420 planar(3 plane)*/
+    LV_COLOR_FORMAT_I422              = 0x21,  /*YUV422 planar(3 plane)*/
+    LV_COLOR_FORMAT_I444              = 0x22,  /*YUV444 planar(3 plane)*/
+    LV_COLOR_FORMAT_I400              = 0x23,  /*YUV400 no chroma channel*/
+    LV_COLOR_FORMAT_NV21              = 0x24,  /*YUV420 planar(2 plane), UV plane in 'V, U, V, U'*/
+    LV_COLOR_FORMAT_NV12              = 0x25,  /*YUV420 planar(2 plane), UV plane in 'U, V, U, V'*/
+
+    /*YUV packed formats*/
+    LV_COLOR_FORMAT_YUY2              = 0x26,  /*YUV422 packed like 'Y U Y V'*/
+    LV_COLOR_FORMAT_UYVY              = 0x27,  /*YUV422 packed like 'U Y V Y'*/
+
+    LV_COLOR_FORMAT_YUV_END           = LV_COLOR_FORMAT_UYVY,
+
+    /*Color formats in which LVGL can render*/
+#if LV_COLOR_DEPTH == 8
+    LV_COLOR_FORMAT_NATIVE            = LV_COLOR_FORMAT_L8,
+    LV_COLOR_FORMAT_NATIVE_WITH_ALPHA = LV_COLOR_FORMAT_AL88,
+#elif LV_COLOR_DEPTH == 16
+    LV_COLOR_FORMAT_NATIVE            = LV_COLOR_FORMAT_RGB565,
+    LV_COLOR_FORMAT_NATIVE_WITH_ALPHA = LV_COLOR_FORMAT_RGB565A8,
+#elif LV_COLOR_DEPTH == 24
+    LV_COLOR_FORMAT_NATIVE            = LV_COLOR_FORMAT_RGB888,
+    LV_COLOR_FORMAT_NATIVE_WITH_ALPHA = LV_COLOR_FORMAT_ARGB8888,
+#elif LV_COLOR_DEPTH == 32
+    LV_COLOR_FORMAT_NATIVE            = LV_COLOR_FORMAT_XRGB8888,
+    LV_COLOR_FORMAT_NATIVE_WITH_ALPHA = LV_COLOR_FORMAT_ARGB8888,
+#endif
+} lv_color_format_t;
+
+/**********************
+ * MACROS
+ **********************/
+
+#define LV_COLOR_MAKE(r8, g8, b8) {b8, g8, r8}
+
+#define LV_OPA_MIX2(a1, a2) (((int32_t)(a1) * (a2)) >> 8)
+#define LV_OPA_MIX3(a1, a2, a3) (((int32_t)(a1) * (a2) * (a3)) >> 16)
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+/**
+ * Create an ARGB8888 color from RGB888 + alpha
+ * @param color     an RGB888 color
+ * @param opa       the alpha value
+ * @return          the ARGB8888 color
+ */
+lv_color32_t lv_color_to_32(lv_color_t color, lv_opa_t opa);
+
+/**
+ * Convert an RGB888 color to RGB565 stored in `uint16_t`
+ * @param color     an RGB888 color
+ * @return          `color` as RGB565 in a `uint16_t` (red in the top 5 bits, green in the middle 6, blue in the low 5)
+ */
+uint16_t lv_color_to_u16(lv_color_t color);
+
+/**
+ * Convert an RGB888 color to XRGB8888 stored in `uint32_t`
+ * @param color     an RGB888 color
+ * @return          `color` as XRGB8888 in a `uint32_t` (the alpha channel is always set to 0xFF)
+ */
+uint32_t lv_color_to_u32(lv_color_t color);
+
+/**
+ * Mix two RGB565 colors
+ * @param c1        the first color (typically the foreground color)
+ * @param c2        the second color  (typically the background color)
+ * @param mix       0..255, or LV_OPA_0/10/20... (internally reduced to a 0..32 weight)
+ * @return          mix == 0: c2
+ *                  mix == 255: c1
+ *                  mix == 128: 0.5 x c1 + 0.5 x c2
+ */
+static inline uint16_t LV_ATTRIBUTE_FAST_MEM lv_color_16_16_mix(uint16_t c1, uint16_t c2, uint8_t mix)
+{
+    if (mix == 255) {
+        return c1;
+    }
+    if (mix == 0) {
+        return c2;
+    }
+    if (c1 == c2) {
+        return c1;
+    }
+    /*Only 1..254 with differing colors reaches this point*/
+    uint16_t ret;
+
+    /*Round the 8-bit ratio to a 0..32 weight. Source: https://stackoverflow.com/a/50012418/1999969*/
+    mix = (uint32_t)((uint32_t)mix + 4) >> 3;
+
+    /*0x7E0F81F = 0b00000111111000001111100000011111: green is kept in the upper half-word, red and blue in the lower*/
+    uint32_t bg = (uint32_t)(c2 | ((uint32_t)c2 << 16)) & 0x7E0F81F;
+    uint32_t fg = (uint32_t)(c1 | ((uint32_t)c1 << 16)) & 0x7E0F81F;
+    uint32_t result = ((((fg - bg) * mix) >> 5) + bg) & 0x7E0F81F;
+    ret = (uint16_t)(result >> 16) | result; /*Fold green from the upper half-word back between red and blue*/
+
+    return ret;
+}
+
+/**
+ * Check if two ARGB8888 colors are equal (compared channel by channel, so the structs are never type-punned to `uint32_t`)
+ * @param c1    the first color
+ * @param c2    the second color
+ * @return      true: all four channels (blue, green, red, alpha) are equal
+ */
+static inline bool lv_color32_eq(lv_color32_t c1, lv_color32_t c2)
+{
+    return c1.blue == c2.blue && c1.green == c2.green && c1.red == c2.red && c1.alpha == c2.alpha;
+}
+
+/**********************
+ *      MACROS
+ **********************/
+
+#include "lv_color_op.h"
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_COLOR_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color_op.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color_op.h
new file mode 100644
index 00000000..083fcd83
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_color_op.h
@@ -0,0 +1,93 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_color_op.h
+ *
+ */
+
+#ifndef LV_COLOR_OP_H
+#define LV_COLOR_OP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_math.h"
+#include "lv_color.h"
+#include "lv_types.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+/**
+ * Blend two RGB888 colors channel by channel.
+ * @param c1 the color weighted by `mix` (usually the foreground)
+ * @param c2 the color weighted by `255 - mix` (usually the background)
+ * @param mix The ratio of the colors. 0: full `c2`, 255: full `c1`, 127: half `c1` and half `c2`
+ * @return the mixed color
+ */
+static inline lv_color_t LV_ATTRIBUTE_FAST_MEM lv_color_mix(lv_color_t c1, lv_color_t c2, uint8_t mix)
+{
+    const int bg_weight = 255 - mix; /*Weight of `c2`; `mix` is the weight of `c1`*/
+    lv_color_t blended;
+    blended.blue = LV_UDIV255((uint16_t)c1.blue * mix + c2.blue * bg_weight + LV_COLOR_MIX_ROUND_OFS);
+    blended.green = LV_UDIV255((uint16_t)c1.green * mix + c2.green * bg_weight + LV_COLOR_MIX_ROUND_OFS);
+    blended.red = LV_UDIV255((uint16_t)c1.red * mix + c2.red * bg_weight + LV_COLOR_MIX_ROUND_OFS);
+    return blended;
+}
+
+/**
+ * Blend an ARGB8888 foreground over a background, weighted by the foreground alpha.
+ * @param fg    foreground color; `fg.alpha` is the mix ratio
+ * @param bg    background color; `bg.alpha` is carried into the result
+ * @return      `bg` if `fg.alpha <= LV_OPA_MIN`; `fg` with `bg.alpha` if `fg.alpha >= LV_OPA_MAX`; otherwise the blend
+ * @note Use bg.alpha in the return value
+ * @note Use fg.alpha as mix ratio
+ */
+static inline lv_color32_t lv_color_mix32(lv_color32_t fg, lv_color32_t bg)
+{
+    if (fg.alpha <= LV_OPA_MIN) {
+        return bg;
+    }
+    if (fg.alpha >= LV_OPA_MAX) {
+        fg.alpha = bg.alpha;
+        return fg;
+    }
+    bg.blue = ((uint32_t)fg.blue * fg.alpha + (uint32_t)bg.blue * (255 - fg.alpha)) >> 8;
+    bg.green = ((uint32_t)fg.green * fg.alpha + (uint32_t)bg.green * (255 - fg.alpha)) >> 8;
+    bg.red = ((uint32_t)fg.red * fg.alpha + (uint32_t)bg.red * (255 - fg.alpha)) >> 8;
+    return bg;
+}
+
+/**********************
+ *  PREDEFINED COLORS
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_COLOR_OP_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
new file mode 100644
index 00000000..01c5f769
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
@@ -0,0 +1,74 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_draw_sw_blend.h
+ *
+ */
+
+#ifndef LV_DRAW_SW_BLEND_H
+#define LV_DRAW_SW_BLEND_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_style.h"
+#include "lv_color.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+typedef struct {
+    void *dest_buf;             /*Destination pixel buffer that the fill writes to*/
+    int32_t dest_w;             /*Number of pixels filled in each row*/
+    int32_t dest_h;             /*Number of rows filled*/
+    int32_t dest_stride;        /*Row-to-row step passed to drawbuf_next_row(); presumably in bytes - TODO confirm*/
+    const lv_opa_t *mask_buf;   /*Per-pixel opacity mask, or NULL when the fill is not masked*/
+    int32_t mask_stride;        /*Number of mask elements the mask pointer advances after each row*/
+    lv_color_t color;           /*Fill color*/
+    lv_opa_t opa;               /*Overall opacity; values >= LV_OPA_MAX take the fully opaque paths*/
+    bool use_asm;               /*true: the ARGB8888 simple fill calls the assembly blend macro instead of the C loops*/
+} _lv_draw_sw_blend_fill_dsc_t;
+
+typedef struct {
+    void *dest_buf;
+    int32_t dest_w;
+    int32_t dest_h;
+    int32_t dest_stride;
+    const lv_opa_t *mask_buf;
+    int32_t mask_stride;
+    const void *src_buf;
+    int32_t src_stride;
+    lv_color_format_t src_color_format;
+    lv_opa_t opa;
+    lv_blend_mode_t blend_mode;
+} _lv_draw_sw_blend_image_dsc_t;
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_DRAW_SW_BLEND_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_argb8888.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_argb8888.h
new file mode 100644
index 00000000..c6c94880
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_argb8888.h
@@ -0,0 +1,51 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_draw_sw_blend_to_argb8888.h
+ *
+ */
+
+#ifndef LV_DRAW_SW_BLEND_ARGB8888_H
+#define LV_DRAW_SW_BLEND_ARGB8888_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_draw_sw_blend.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_color_to_argb8888(_lv_draw_sw_blend_fill_dsc_t *dsc);
+
+void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_image_to_argb8888(_lv_draw_sw_blend_image_dsc_t *dsc);
+
+/**********************
+ *      MACROS
+ **********************/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_DRAW_SW_BLEND_ARGB8888_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb565.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb565.h
new file mode 100644
index 00000000..e8c1873d
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb565.h
@@ -0,0 +1,51 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_draw_sw_blend_to_rgb565.h
+ *
+ */
+
+#ifndef LV_DRAW_SW_BLEND_RGB565_H
+#define LV_DRAW_SW_BLEND_RGB565_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_draw_sw_blend.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_color_to_rgb565(_lv_draw_sw_blend_fill_dsc_t *dsc);
+
+void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_image_to_rgb565(_lv_draw_sw_blend_image_dsc_t *dsc);
+
+/**********************
+ *      MACROS
+ **********************/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_DRAW_SW_BLEND_RGB565_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_log.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_log.h
new file mode 100644
index 00000000..c7250c96
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_log.h
@@ -0,0 +1,45 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_log.h
+ *
+ */
+
+#ifndef LV_LOG_H
+#define LV_LOG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_types.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/*Logging is always compiled out here (same effect as `LV_USE_LOG 0`): every log macro expands to nothing*/
+#define _lv_log_add(level, file, line, ...)
+#define LV_LOG_TRACE(...) do {}while(0)
+#define LV_LOG_INFO(...) do {}while(0)
+#define LV_LOG_WARN(...) do {}while(0)
+#define LV_LOG_ERROR(...) do {}while(0)
+#define LV_LOG_USER(...) do {}while(0)
+#define LV_LOG(...) do {}while(0)
+
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_LOG_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_math.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_math.h
new file mode 100644
index 00000000..52508a8e
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_math.h
@@ -0,0 +1,56 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_math.h
+ *
+ */
+
+#ifndef LV_MATH_H
+#define LV_MATH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_types.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      MACROS
+ **********************/
+#define LV_MIN(a, b) ((a) < (b) ? (a) : (b))
+#define LV_MIN3(a, b, c) (LV_MIN(LV_MIN(a,b), c))
+#define LV_MIN4(a, b, c, d) (LV_MIN(LV_MIN(a,b), LV_MIN(c,d)))
+
+#define LV_MAX(a, b) ((a) > (b) ? (a) : (b))
+#define LV_MAX3(a, b, c) (LV_MAX(LV_MAX(a,b), c))
+#define LV_MAX4(a, b, c, d) (LV_MAX(LV_MAX(a,b), LV_MAX(c,d)))
+
+#define LV_CLAMP(min, val, max) (LV_MAX(min, (LV_MIN(val, max))))
+
+#define LV_ABS(x) ((x) > 0 ? (x) : (-(x)))
+#define LV_UDIV255(x) (((x) * 0x8081U) >> 0x17)
+
+#define LV_IS_SIGNED(t) (((t)(-1)) < ((t)0))
+#define LV_UMAX_OF(t) (((0x1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL) | (0xFULL << ((sizeof(t) * 8ULL) - 4ULL)))
+#define LV_SMAX_OF(t) (((0x1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL) | (0x7ULL << ((sizeof(t) * 8ULL) - 4ULL)))
+#define LV_MAX_OF(t) ((unsigned long)(LV_IS_SIGNED(t) ? LV_SMAX_OF(t) : LV_UMAX_OF(t)))
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_MATH_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_style.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_style.h
new file mode 100644
index 00000000..dd813add
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_style.h
@@ -0,0 +1,48 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_style.h
+ *
+ */
+
+#ifndef LV_STYLE_H
+#define LV_STYLE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**
+ * Possible options how to blend opaque drawings
+ */
+typedef enum {
+    LV_BLEND_MODE_NORMAL,     /**< Simply mix according to the opacity value*/
+    LV_BLEND_MODE_ADDITIVE,   /**< Add the respective color channels*/
+    LV_BLEND_MODE_SUBTRACTIVE,/**< Subtract the foreground from the background*/
+    LV_BLEND_MODE_MULTIPLY,   /**< Multiply the foreground and background*/
+} lv_blend_mode_t;
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_STYLE_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
new file mode 100644
index 00000000..2e9244fe
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
@@ -0,0 +1,51 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_types.h
+ *
+ */
+
+#ifndef LV_TYPES_H
+#define LV_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include <stdint.h>   /*uintptr_t: keeps this header usable regardless of the includer's include order*/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**
+ * LVGL error codes.
+ */
+typedef enum {
+    LV_RESULT_INVALID = 0, /*Typically the object was deleted (became invalid) in the action function, or an operation failed*/
+    LV_RESULT_OK,          /*The object is still valid (not deleted) after the action*/
+} lv_result_t;
+
+typedef uintptr_t lv_uintptr_t;
+
+/**********************
+ *      MACROS
+ **********************/
+
+#define LV_UNUSED(x) ((void)(x))   /*Mark `x` as intentionally unused; the argument is parenthesized so any expression is accepted*/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_TYPES_H*/
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_color.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_color.c
new file mode 100644
index 00000000..a2865e47
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_color.c
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_color.c
+ *
+ */
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_color.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ *  STATIC PROTOTYPES
+ **********************/
+
+/**********************
+ *  GLOBAL VARIABLES
+ **********************/
+
+/**********************
+ *  STATIC VARIABLES
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+
+/**********************
+ *   GLOBAL FUNCTIONS
+ **********************/
+
+lv_color32_t lv_color_to_32(lv_color_t color, lv_opa_t opa)
+{
+    /*The three color channels are copied unchanged; `opa` becomes the alpha byte*/
+    const lv_color32_t argb = {
+        .blue = color.blue, .green = color.green,
+        .red = color.red, .alpha = opa,
+    };
+    return argb;
+}
+
+uint16_t lv_color_to_u16(lv_color_t color)
+{
+    return (uint16_t)(((color.red >> 3) << 11) | ((color.green >> 2) << 5) | (color.blue >> 3)); /*RRRRRGGG GGGBBBBB*/
+}
+
+uint32_t lv_color_to_u32(lv_color_t color)
+{
+    return 0xFF000000u | ((uint32_t)color.red << 16) | ((uint32_t)color.green << 8) | color.blue; /*Alpha forced to 0xFF*/
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
new file mode 100644
index 00000000..f18e3670
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
@@ -0,0 +1,911 @@
+﻿/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_draw_sw_blend_to_argb8888.c
+ *
+ */
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_draw_sw_blend_to_argb8888.h"
+
+#include "lv_assert.h"
+#include "lv_types.h"
+#include "lv_log.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_math.h"
+#include "lv_color.h"
+#include "string.h"
+
+#include "esp_lvgl_port_lv_blend.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+#define LV_ATTRIBUTE_FAST_MEM
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+typedef struct {
+    lv_color32_t fg_saved;
+    lv_color32_t bg_saved;
+    lv_color32_t res_saved;
+    lv_opa_t res_alpha_saved;
+    lv_opa_t ratio_saved;
+} lv_color_mix_alpha_cache_t;
+
+/**********************
+ *  STATIC PROTOTYPES
+ **********************/
+
+static void /* LV_ATTRIBUTE_FAST_MEM */ al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+
+static void /* LV_ATTRIBUTE_FAST_MEM */ l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+
+static void /* LV_ATTRIBUTE_FAST_MEM */ rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+
+static void /* LV_ATTRIBUTE_FAST_MEM */ rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc,
+        const uint8_t src_px_size);
+
+static void /* LV_ATTRIBUTE_FAST_MEM */ argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+
+static inline void /* LV_ATTRIBUTE_FAST_MEM */ lv_color_8_32_mix(const uint8_t src, lv_color32_t *dest, uint8_t mix);
+
+static inline lv_color32_t /* LV_ATTRIBUTE_FAST_MEM */ lv_color_32_32_mix(lv_color32_t fg, lv_color32_t bg,
+        lv_color_mix_alpha_cache_t *cache);
+
+static void lv_color_mix_with_alpha_cache_init(lv_color_mix_alpha_cache_t *cache);
+
+static inline void /* LV_ATTRIBUTE_FAST_MEM */ blend_non_normal_pixel(lv_color32_t *dest, lv_color32_t src,
+        lv_blend_mode_t mode, lv_color_mix_alpha_cache_t *cache);
+static inline void * /* LV_ATTRIBUTE_FAST_MEM */ drawbuf_next_row(const void *buf, uint32_t stride);
+
+/**********************
+ *  STATIC VARIABLES
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888
+#define LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888(...)                         LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_WITH_OPA
+#define LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_WITH_OPA(...)                LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_WITH_MASK
+#define LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_WITH_MASK(...)               LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_MIX_MASK_OPA
+#define LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_MIX_MASK_OPA(...)            LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888(...)                            LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_WITH_OPA
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(...)                   LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_WITH_MASK
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(...)                  LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(...)               LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888(...)                            LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_WITH_OPA
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(...)                   LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_WITH_MASK
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(...)                  LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(...)               LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888(...)                 LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_WITH_OPA
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(...)        LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_WITH_MASK
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(...)       LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(...)    LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888(...)                 LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_WITH_OPA
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(...)        LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_WITH_MASK
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(...)       LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(...)    LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888(...)               LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_WITH_OPA
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(...)      LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_WITH_MASK
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(...)     LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(...)  LV_RESULT_INVALID
+#endif
+
+/**********************
+ *   GLOBAL FUNCTIONS
+ **********************/
+
+void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_color_to_argb8888(_lv_draw_sw_blend_fill_dsc_t *dsc)
+{   /*Fill a dest_w x dest_h area of dsc->dest_buf with dsc->color; mask presence and opa pick one of four paths*/
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    const lv_opa_t *mask = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;     /*Added to the mask pointer after each row*/
+    int32_t dest_stride = dsc->dest_stride;     /*Passed to drawbuf_next_row() after each row*/
+
+    lv_color_mix_alpha_cache_t cache;
+    lv_color_mix_with_alpha_cache_init(&cache);
+
+    int32_t x;
+    int32_t y;
+
+    LV_UNUSED(w);
+    LV_UNUSED(h);
+    LV_UNUSED(x);
+    LV_UNUSED(y);
+    LV_UNUSED(opa);
+    LV_UNUSED(mask);
+    LV_UNUSED(mask_stride);
+    LV_UNUSED(dest_stride);
+
+    /*Simple fill: no mask and full opacity, every pixel is overwritten with the color*/
+    if (mask == NULL && opa >= LV_OPA_MAX) {
+        if (dsc->use_asm) {
+            LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888(dsc);    /*NOTE(review): result is ignored, no C fallback on this path*/
+        } else {
+            uint32_t color32 = lv_color_to_u32(dsc->color);
+            uint32_t *dest_buf = dsc->dest_buf;
+            for (y = 0; y < h; y++) {
+                for (x = 0; x < w - 16; x += 16) {      /*Unrolled: 16 pixels per iteration*/
+                    dest_buf[x + 0] = color32;
+                    dest_buf[x + 1] = color32;
+                    dest_buf[x + 2] = color32;
+                    dest_buf[x + 3] = color32;
+
+                    dest_buf[x + 4] = color32;
+                    dest_buf[x + 5] = color32;
+                    dest_buf[x + 6] = color32;
+                    dest_buf[x + 7] = color32;
+
+                    dest_buf[x + 8] = color32;
+                    dest_buf[x + 9] = color32;
+                    dest_buf[x + 10] = color32;
+                    dest_buf[x + 11] = color32;
+
+                    dest_buf[x + 12] = color32;
+                    dest_buf[x + 13] = color32;
+                    dest_buf[x + 14] = color32;
+                    dest_buf[x + 15] = color32;
+                }
+                for (; x < w; x ++) {                   /*Remaining pixels of the row, one at a time*/
+                    dest_buf[x] = color32;
+                }
+
+                dest_buf = drawbuf_next_row(dest_buf, dest_stride);
+            }
+        }
+
+    }
+    /*Opacity only: no mask, the same alpha (opa) is used for every pixel*/
+    else if (mask == NULL && opa < LV_OPA_MAX) {
+        if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_WITH_OPA(dsc)) {
+            lv_color32_t color_argb = lv_color_to_32(dsc->color, opa);
+            lv_color32_t *dest_buf = dsc->dest_buf;
+
+            for (y = 0; y < h; y++) {
+                for (x = 0; x < w; x++) {
+                    dest_buf[x] = lv_color_32_32_mix(color_argb, dest_buf[x], &cache);
+                }
+                dest_buf = drawbuf_next_row(dest_buf, dest_stride);
+            }
+        }
+
+    }
+    /*Masked with full opacity: the per-pixel alpha is the mask value itself*/
+    else if (mask && opa >= LV_OPA_MAX) {
+        if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_WITH_MASK(dsc)) {
+            lv_color32_t color_argb = lv_color_to_32(dsc->color, 0xff);
+            lv_color32_t *dest_buf = dsc->dest_buf;
+            for (y = 0; y < h; y++) {
+                for (x = 0; x < w; x++) {
+                    color_argb.alpha = mask[x];
+                    dest_buf[x] = lv_color_32_32_mix(color_argb, dest_buf[x], &cache);
+                }
+
+                dest_buf = drawbuf_next_row(dest_buf, dest_stride);
+                mask += mask_stride;
+            }
+        }
+
+    }
+    /*Masked with opacity: the per-pixel alpha is the mask value combined with opa*/
+    else {
+        if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_ARGB8888_MIX_MASK_OPA(dsc)) {
+            lv_color32_t color_argb = lv_color_to_32(dsc->color, opa);
+            lv_color32_t *dest_buf = dsc->dest_buf;
+            for (y = 0; y < h; y++) {
+                for (x = 0; x < w; x++) {
+                    color_argb.alpha = LV_OPA_MIX2(mask[x], opa);
+                    dest_buf[x] = lv_color_32_32_mix(color_argb, dest_buf[x], &cache);
+                }
+                dest_buf = drawbuf_next_row(dest_buf, dest_stride);
+                mask += mask_stride;
+            }
+        }
+    }
+}
+
+void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_image_to_argb8888(_lv_draw_sw_blend_image_dsc_t *dsc)
+{   /*Dispatch to the image blender matching dsc->src_color_format; other formats are only logged*/
+    switch (dsc->src_color_format) {
+    case LV_COLOR_FORMAT_RGB565:
+        rgb565_image_blend(dsc);
+        break;
+    case LV_COLOR_FORMAT_RGB888:
+        rgb888_image_blend(dsc, 3);     /*3 bytes per source pixel*/
+        break;
+    case LV_COLOR_FORMAT_XRGB8888:
+        rgb888_image_blend(dsc, 4);     /*4 bytes per source pixel*/
+        break;
+    case LV_COLOR_FORMAT_ARGB8888:
+        argb8888_image_blend(dsc);
+        break;
+    case LV_COLOR_FORMAT_L8:
+        l8_image_blend(dsc);
+        break;
+    case LV_COLOR_FORMAT_AL88:
+        al88_image_blend(dsc);
+        break;
+    default:
+        LV_LOG_WARN("Not supported source color format");   /*Destination is left untouched*/
+        break;
+    }
+}
+
+/**********************
+ *   STATIC FUNCTIONS
+ **********************/
+
+static void LV_ATTRIBUTE_FAST_MEM al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{   /*Blend an AL88 (luminance + alpha) source image into the lv_color32_t destination buffer*/
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    lv_color32_t *dest_buf_c32 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const lv_color16a_t *src_buf_al88 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    int32_t dest_x;
+    int32_t src_x;
+    int32_t y;
+
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        /*
+                         * The AL88 pixel is not copied verbatim here, even
+                         * with full opacity: its luminance is mixed into the
+                         * destination pixel, weighted by the pixel's own
+                         * alpha value.
+                         */
+                        lv_color_8_32_mix(src_buf_al88[src_x].lumi, &dest_buf_c32[dest_x], src_buf_al88[src_x].alpha);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {  /*Weight: pixel alpha combined with opa*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        lv_color_8_32_mix(src_buf_al88[src_x].lumi, &dest_buf_c32[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa));
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {         /*Weight: pixel alpha combined with the mask*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        lv_color_8_32_mix(src_buf_al88[src_x].lumi, &dest_buf_c32[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha,
+                                          mask_buf[src_x]));
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        } else if (mask_buf && opa < LV_OPA_MAX) {          /*Weight: pixel alpha, mask and opa combined*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        lv_color_8_32_mix(src_buf_al88[src_x].lumi, &dest_buf_c32[dest_x], LV_OPA_MIX3(src_buf_al88[src_x].alpha,
+                                          mask_buf[src_x], opa));
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {    /*Other blend modes: one generic loop, the luminance is expanded to a gray ARGB pixel first*/
+        lv_color32_t src_argb;
+        lv_color_mix_alpha_cache_t cache;
+        lv_color_mix_with_alpha_cache_init(&cache);
+        for (y = 0; y < h; y++) {
+            for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                src_argb.red = src_buf_al88[src_x].lumi;
+                src_argb.green = src_buf_al88[src_x].lumi;
+                src_argb.blue = src_buf_al88[src_x].lumi;
+                if (mask_buf == NULL) {
+                    src_argb.alpha = LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa);
+                } else {
+                    src_argb.alpha = LV_OPA_MIX3(src_buf_al88[src_x].alpha, mask_buf[dest_x], opa);
+                }
+                blend_non_normal_pixel(&dest_buf_c32[dest_x], src_argb, dsc->blend_mode, &cache);
+            }
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+            dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+            src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+        }
+    }
+}
+
+static void LV_ATTRIBUTE_FAST_MEM l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{   /*Blend an L8 source image (one luminance byte per pixel) into the lv_color32_t destination buffer*/
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    lv_color32_t *dest_buf_c32 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const uint8_t *src_buf_l8 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    int32_t dest_x;
+    int32_t src_x;
+    int32_t y;
+
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {        /*Direct store, no mixing*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        dest_buf_c32[dest_x].alpha = src_buf_l8[src_x];     /*NOTE(review): alpha is taken from the luminance too - confirm*/
+                        dest_buf_c32[dest_x].red = src_buf_l8[src_x];
+                        dest_buf_c32[dest_x].green = src_buf_l8[src_x];
+                        dest_buf_c32[dest_x].blue = src_buf_l8[src_x];
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {  /*Weight: opa*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        lv_color_8_32_mix(src_buf_l8[src_x], &dest_buf_c32[dest_x], opa);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {         /*Weight: mask value*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        lv_color_8_32_mix(src_buf_l8[src_x], &dest_buf_c32[dest_x], mask_buf[src_x]);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        } else if (mask_buf && opa < LV_OPA_MAX) {          /*Weight: mask value combined with opa*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                        lv_color_8_32_mix(src_buf_l8[src_x], &dest_buf_c32[dest_x], LV_OPA_MIX2(mask_buf[src_x], opa));
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {    /*Other blend modes: one generic loop, the luminance is expanded to a gray ARGB pixel first*/
+        lv_color32_t src_argb;
+        lv_color_mix_alpha_cache_t cache;
+        lv_color_mix_with_alpha_cache_init(&cache);
+        for (y = 0; y < h; y++) {
+            for (dest_x = 0, src_x = 0; src_x < w; dest_x++, src_x++) {
+                src_argb.red = src_buf_l8[src_x];
+                src_argb.green = src_buf_l8[src_x];
+                src_argb.blue = src_buf_l8[src_x];
+                if (mask_buf == NULL) {
+                    src_argb.alpha = opa;
+                } else {
+                    src_argb.alpha = LV_OPA_MIX2(mask_buf[dest_x], opa);
+                }
+                blend_non_normal_pixel(&dest_buf_c32[dest_x], src_argb, dsc->blend_mode, &cache);
+            }
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+            dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+            src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride);
+        }
+    }
+}
+
+static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{   /*Blend an lv_color16_t source image into the lv_color32_t destination, widening each channel to 8 bits*/
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    lv_color32_t *dest_buf_c32 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const lv_color16_t *src_buf_c16 = (const lv_color16_t *) dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    lv_color32_t color_argb;
+    lv_color_mix_alpha_cache_t cache;
+    lv_color_mix_with_alpha_cache_init(&cache);
+
+    int32_t x;
+    int32_t y;
+
+    LV_UNUSED(color_argb);
+
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL) {     /*Both opacity cases share the C loop below; only the macro that is tried first differs*/
+            lv_result_t accelerated;
+            if (opa >= LV_OPA_MAX) {
+                accelerated = LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888(dsc);
+            } else {
+                accelerated = LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(dsc);
+            }
+            if (LV_RESULT_INVALID == accelerated) {
+                color_argb.alpha = opa;
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        color_argb.red = (src_buf_c16[x].red * 2106) >> 8;  /*Scale to 8 bits with rounding: 31 -> 255*/
+                        color_argb.green = (src_buf_c16[x].green * 1037) >> 8;  /*63 -> 255*/
+                        color_argb.blue = (src_buf_c16[x].blue * 2106) >> 8;
+                        dest_buf_c32[x] = lv_color_32_32_mix(color_argb, dest_buf_c32[x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {     /*Per-pixel alpha: mask value*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        color_argb.alpha = mask_buf[x];
+                        color_argb.red = (src_buf_c16[x].red * 2106) >> 8;  /*Scale to 8 bits with rounding: 31 -> 255*/
+                        color_argb.green = (src_buf_c16[x].green * 1037) >> 8;
+                        color_argb.blue = (src_buf_c16[x].blue * 2106) >> 8;
+                        dest_buf_c32[x] = lv_color_32_32_mix(color_argb, dest_buf_c32[x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        } else {                                        /*Mask present and opa < LV_OPA_MAX*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        color_argb.alpha = LV_OPA_MIX2(mask_buf[x], opa);
+                        color_argb.red = (src_buf_c16[x].red * 2106) >> 8;  /*Scale to 8 bits with rounding: 31 -> 255*/
+                        color_argb.green = (src_buf_c16[x].green * 1037) >> 8;
+                        color_argb.blue = (src_buf_c16[x].blue * 2106) >> 8;
+                        dest_buf_c32[x] = lv_color_32_32_mix(color_argb, dest_buf_c32[x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {    /*Other blend modes: one generic loop for every mask/opa combination*/
+        lv_color32_t src_argb;
+        for (y = 0; y < h; y++) {
+            for (x = 0; x < w; x++) {
+                src_argb.red = (src_buf_c16[x].red * 2106) >> 8;
+                src_argb.green = (src_buf_c16[x].green * 1037) >> 8;
+                src_argb.blue = (src_buf_c16[x].blue * 2106) >> 8;
+                if (mask_buf == NULL) {
+                    src_argb.alpha = opa;
+                } else {
+                    src_argb.alpha = LV_OPA_MIX2(mask_buf[x], opa);
+                }
+                blend_non_normal_pixel(&dest_buf_c32[x], src_argb, dsc->blend_mode, &cache);
+            }
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+            dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+            src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride);
+        }
+    }
+}
+
+static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, const uint8_t src_px_size)
+{
+    /*
+     * Blend an image with src_px_size bytes per pixel (the dispatcher passes
+     * 3 or 4) into lv_color32_t pixels. Source bytes 0, 1 and 2 of each pixel
+     * are used as blue, green and red. In normal blend mode exactly one of the
+     * four mask/opa cases runs; every other blend mode uses the generic loop.
+     */
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    lv_color32_t *dest_buf_c32 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const uint8_t *src_buf = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    lv_color32_t color_argb;
+    lv_color_mix_alpha_cache_t cache;
+    lv_color_mix_with_alpha_cache_init(&cache);
+
+    int32_t dest_x;
+    int32_t src_x;
+    int32_t y;
+
+    LV_UNUSED(color_argb);
+
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        /*No mask, full opacity: the source overwrites the destination without mixing*/
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888(dsc, src_px_size)) {
+                if (src_px_size == 4) {
+                    uint32_t line_in_bytes = w * 4;
+                    for (y = 0; y < h; y++) {
+                        memcpy(dest_buf_c32, src_buf, line_in_bytes);   // lv_memcpy
+                        dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                        src_buf = drawbuf_next_row(src_buf, src_stride);
+                    }
+                } else if (src_px_size == 3) {
+                    for (y = 0; y < h; y++) {
+                        for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += 3) {
+                            dest_buf_c32[dest_x].red = src_buf[src_x + 2];
+                            dest_buf_c32[dest_x].green = src_buf[src_x + 1];
+                            dest_buf_c32[dest_x].blue = src_buf[src_x + 0];
+                            dest_buf_c32[dest_x].alpha = 0xff;
+                        }
+                        dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                        src_buf = drawbuf_next_row(src_buf, src_stride);
+                    }
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(dsc, src_px_size)) {
+                color_argb.alpha = opa;
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                        color_argb.red = src_buf[src_x + 2];
+                        color_argb.green = src_buf[src_x + 1];
+                        color_argb.blue = src_buf[src_x + 0];
+                        dest_buf_c32[dest_x] = lv_color_32_32_mix(color_argb, dest_buf_c32[dest_x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf = drawbuf_next_row(src_buf, src_stride);
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(dsc, src_px_size)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                        color_argb.alpha = mask_buf[dest_x];
+                        color_argb.red = src_buf[src_x + 2];
+                        color_argb.green = src_buf[src_x + 1];
+                        color_argb.blue = src_buf[src_x + 0];
+                        dest_buf_c32[dest_x] = lv_color_32_32_mix(color_argb, dest_buf_c32[dest_x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf = drawbuf_next_row(src_buf, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        } else if (mask_buf && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(dsc, src_px_size)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                        color_argb.alpha = (opa * mask_buf[dest_x]) >> 8;
+                        color_argb.red = src_buf[src_x + 2];
+                        color_argb.green = src_buf[src_x + 1];
+                        color_argb.blue = src_buf[src_x + 0];
+                        dest_buf_c32[dest_x] = lv_color_32_32_mix(color_argb, dest_buf_c32[dest_x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf = drawbuf_next_row(src_buf, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {
+        lv_color32_t src_argb;
+        for (y = 0; y < h; y++) {
+            for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                src_argb.red = src_buf[src_x + 2];
+                src_argb.green = src_buf[src_x + 1];
+                src_argb.blue = src_buf[src_x + 0];
+                if (mask_buf == NULL) {
+                    src_argb.alpha = opa;
+                } else {
+                    src_argb.alpha = LV_OPA_MIX2(mask_buf[dest_x], opa);
+                }
+
+                blend_non_normal_pixel(&dest_buf_c32[dest_x], src_argb, dsc->blend_mode, &cache);
+            }
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+            dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+            src_buf = drawbuf_next_row(src_buf, src_stride);
+        }
+    }
+}
+
+static void LV_ATTRIBUTE_FAST_MEM argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{   /*Blend an lv_color32_t source image into the lv_color32_t destination, using the source pixels' own alpha*/
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    lv_color32_t *dest_buf_c32 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const lv_color32_t *src_buf_c32 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    lv_color32_t color_argb;
+    lv_color_mix_alpha_cache_t cache;
+    lv_color_mix_with_alpha_cache_init(&cache);
+
+    int32_t x;
+    int32_t y;
+
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {        /*Source alpha used unchanged*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        dest_buf_c32[x] = lv_color_32_32_mix(src_buf_c32[x], dest_buf_c32[x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {  /*Source alpha combined with opa*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_WITH_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        color_argb = src_buf_c32[x];
+                        color_argb.alpha = LV_OPA_MIX2(color_argb.alpha, opa);
+                        dest_buf_c32[x] = lv_color_32_32_mix(color_argb, dest_buf_c32[x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {         /*Source alpha combined with the mask*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        color_argb = src_buf_c32[x];
+                        color_argb.alpha = LV_OPA_MIX2(color_argb.alpha, mask_buf[x]);
+                        dest_buf_c32[x] = lv_color_32_32_mix(color_argb, dest_buf_c32[x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        } else if (mask_buf && opa < LV_OPA_MAX) {          /*Source alpha, opa and mask combined*/
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_ARGB8888_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        color_argb = src_buf_c32[x];
+                        color_argb.alpha = LV_OPA_MIX3(color_argb.alpha, opa, mask_buf[x]);
+                        dest_buf_c32[x] = lv_color_32_32_mix(color_argb, dest_buf_c32[x], &cache);
+                    }
+                    dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+                    src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {    /*Other blend modes: one generic loop for every mask/opa combination*/
+        for (y = 0; y < h; y++) {
+            for (x = 0; x < w; x++) {
+                color_argb = src_buf_c32[x];
+                if (mask_buf == NULL) {
+                    color_argb.alpha = LV_OPA_MIX2(color_argb.alpha, opa);
+                } else {
+                    color_argb.alpha = LV_OPA_MIX3(color_argb.alpha, mask_buf[x], opa);
+                }
+                blend_non_normal_pixel(&dest_buf_c32[x], color_argb, dsc->blend_mode, &cache);
+            }
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+            dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
+            src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride);
+        }
+    }
+}
+
+static inline void LV_ATTRIBUTE_FAST_MEM lv_color_8_32_mix(const uint8_t src, lv_color32_t *dest, uint8_t mix)
+{
+    /*Mix the gray value src into *dest with weight mix (0: *dest untouched, >= LV_OPA_MAX: channels replaced)*/
+    if (mix == 0) {
+        return;
+    }
+
+    dest->alpha = 255;      /*Any non-zero mix makes the destination fully opaque*/
+    if (mix >= LV_OPA_MAX) {
+        dest->red = src;
+        dest->green = src;
+        dest->blue = src;
+    } else {
+        lv_opa_t mix_inv = 255 - mix;   /*Weights add up to 255 while the sum is divided by 256 (>> 8)*/
+        dest->red = (uint32_t)((uint32_t)src * mix + dest->red * mix_inv) >> 8;
+        dest->green = (uint32_t)((uint32_t)src * mix + dest->green * mix_inv) >> 8;
+        dest->blue = (uint32_t)((uint32_t)src * mix + dest->blue * mix_inv) >> 8;
+    }
+}
+
+static inline lv_color32_t LV_ATTRIBUTE_FAST_MEM lv_color_32_32_mix(lv_color32_t fg, lv_color32_t bg,
+        lv_color_mix_alpha_cache_t *cache)
+{   /*Return fg drawn over bg; the cache is read and written only when both alphas are partial*/
+    /*Pick the foreground if it's fully opaque or the Background is fully transparent*/
+    if (fg.alpha >= LV_OPA_MAX || bg.alpha <= LV_OPA_MIN) {
+        return fg;
+    }
+    /*Transparent foreground: use the Background*/
+    else if (fg.alpha <= LV_OPA_MIN) {
+        return bg;
+    }
+    /*Opaque background: use simple mix*/
+    else if (bg.alpha == 255) {
+        return lv_color_mix32(fg, bg);
+    }
+    /*Both colors have alpha. Expensive calculation need to be applied*/
+    else {
+        /*Save the parameters and the result. If they will be asked again don't compute again*/
+
+        /*Update the ratio and the result alpha value if the input alpha values change*/
+        if (bg.alpha != cache->bg_saved.alpha || fg.alpha != cache->fg_saved.alpha) {
+            /*Info:
+             * https://en.wikipedia.org/wiki/Alpha_compositing#Analytical_derivation_of_the_over_operator*/
+            cache->res_alpha_saved  = 255 - LV_OPA_MIX2(255 - fg.alpha, 255 - bg.alpha);
+            LV_ASSERT(cache->res_alpha_saved != 0);     /*It is the divisor on the next line*/
+            cache->ratio_saved = (uint32_t)((uint32_t)fg.alpha * 255) / cache->res_alpha_saved;
+        }
+
+        if (!lv_color32_eq(bg, cache->bg_saved) || !lv_color32_eq(fg, cache->fg_saved)) {   /*Inputs differ from the cached pair*/
+            cache->fg_saved = fg;
+            cache->bg_saved = bg;
+            fg.alpha = cache->ratio_saved;      /*Mix with the ratio, then store the computed result alpha*/
+            cache->res_saved = lv_color_mix32(fg, bg);
+            cache->res_saved.alpha = cache->res_alpha_saved;
+        }
+
+        return cache->res_saved;
+    }
+}
+
+void lv_color_mix_with_alpha_cache_init(lv_color_mix_alpha_cache_t *cache)
+{   /*Reset the cache: 255 for the saved alpha and ratio, all-zero bytes for the three saved colors*/
+    cache->ratio_saved = 255;
+    cache->res_alpha_saved = 255;
+    memset(&cache->res_saved, 0x00, sizeof(cache->res_saved));  //lv_memzero
+    memset(&cache->bg_saved, 0x00, sizeof(cache->bg_saved));    //lv_memzero
+    memset(&cache->fg_saved, 0x00, sizeof(cache->fg_saved));    //lv_memzero
+}
+
+static inline void LV_ATTRIBUTE_FAST_MEM blend_non_normal_pixel(lv_color32_t *dest, lv_color32_t src,
+        lv_blend_mode_t mode, lv_color_mix_alpha_cache_t *cache)
+{   /*Combine src with *dest per channel according to mode, then mix that result over *dest using src.alpha*/
+    lv_color32_t res;
+    switch (mode) {
+    case LV_BLEND_MODE_ADDITIVE:        /*Sum of the channels, limited to 255*/
+        res.red = LV_MIN(dest->red + src.red, 255);
+        res.green = LV_MIN(dest->green + src.green, 255);
+        res.blue = LV_MIN(dest->blue + src.blue, 255);
+        break;
+    case LV_BLEND_MODE_SUBTRACTIVE:     /*dest minus src, limited to 0*/
+        res.red = LV_MAX(dest->red - src.red, 0);
+        res.green = LV_MAX(dest->green - src.green, 0);
+        res.blue = LV_MAX(dest->blue - src.blue, 0);
+        break;
+    case LV_BLEND_MODE_MULTIPLY:        /*Product of the channels divided by 256 (>> 8)*/
+        res.red = (dest->red * src.red) >> 8;
+        res.green = (dest->green * src.green) >> 8;
+        res.blue = (dest->blue * src.blue) >> 8;
+        break;
+    default:                            /*Any other mode: log it and leave *dest unchanged*/
+        LV_LOG_WARN("Not supported blend mode: %d", mode);
+        return;
+    }
+    res.alpha = src.alpha;
+    *dest = lv_color_32_32_mix(res, *dest, cache);
+}
+
+static inline void *LV_ATTRIBUTE_FAST_MEM drawbuf_next_row(const void *buf, uint32_t stride)
+{   /*Return the address stride bytes past buf; callers use it to step a row pointer*/
+    return &((uint8_t *)buf)[stride];
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
new file mode 100644
index 00000000..361571ff
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
@@ -0,0 +1,960 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_draw_sw_blend_to_rgb565.c
+ *
+ */
+
+/*********************
+ *      INCLUDES
+ *********************/
+#include "lv_draw_sw_blend_to_rgb565.h"
+
+#include "lv_assert.h"
+#include "lv_types.h"
+#include "lv_log.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_math.h"
+#include "lv_color.h"
+#include "string.h"
+
+#include "esp_lvgl_port_lv_blend.h"
+
+
+/*********************
+ *      DEFINES
+ *********************/
+/*Expands to nothing here, so functions tagged with it carry no extra attribute*/
+#define LV_ATTRIBUTE_FAST_MEM
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ *  STATIC PROTOTYPES
+ **********************/
+/*Blend an AL88 source image (lumi and alpha per pixel) onto the RGB565 destination*/
+static void /* LV_ATTRIBUTE_FAST_MEM */ al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+/*Blend an L8 source image (one 8 bit value per pixel) onto the RGB565 destination*/
+static void /* LV_ATTRIBUTE_FAST_MEM */ l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+/*Blend an RGB565 source image onto the RGB565 destination*/
+static void /* LV_ATTRIBUTE_FAST_MEM */ rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+/*Blend a source with 8 bit channels; src_px_size is the source pixel size in bytes (3 for RGB888, 4 for XRGB8888)*/
+static void /* LV_ATTRIBUTE_FAST_MEM */ rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc,
+        const uint8_t src_px_size);
+/*Blender selected for LV_COLOR_FORMAT_ARGB8888 sources*/
+static void /* LV_ATTRIBUTE_FAST_MEM */ argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc);
+/*Presumably expands an 8 bit luminance value to an RGB565 pixel; its result is stored directly as a destination pixel*/
+static inline uint16_t /* LV_ATTRIBUTE_FAST_MEM */ l8_to_rgb565(const uint8_t c1);
+/*Called as (source luminance, destination pixel, opacity); the returned value replaces the destination pixel*/
+static inline uint16_t /* LV_ATTRIBUTE_FAST_MEM */ lv_color_8_16_mix(const uint8_t c1, uint16_t c2, uint8_t mix);
+/*Called as (pointer to the source pixel bytes, destination pixel, opacity); the result replaces the destination pixel*/
+static inline uint16_t /* LV_ATTRIBUTE_FAST_MEM */ lv_color_24_16_mix(const uint8_t *c1, uint16_t c2, uint8_t mix);
+/*Used to move a row pointer on by a stride after each processed row*/
+static inline void * /* LV_ATTRIBUTE_FAST_MEM */ drawbuf_next_row(const void *buf, uint32_t stride);
+
+/**********************
+ *  STATIC VARIABLES
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+/*Fallbacks: a hook that is not defined yet expands to LV_RESULT_INVALID; callers that test the result then run the C loop*/
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB565
+#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565(...)                           LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB565_WITH_OPA
+#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565_WITH_OPA(...)                  LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB565_WITH_MASK
+#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565_WITH_MASK(...)                 LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB565_MIX_MASK_OPA
+#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565_MIX_MASK_OPA(...)              LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565(...)                       LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_WITH_OPA
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_WITH_OPA(...)              LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_WITH_MASK
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_WITH_MASK(...)             LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA
+#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(...)          LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565(...)                       LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_WITH_OPA
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_WITH_OPA(...)              LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_WITH_MASK
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_WITH_MASK(...)             LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA
+#define LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(...)          LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(...)                   LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_WITH_OPA
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_WITH_OPA(...)          LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_WITH_MASK
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_WITH_MASK(...)         LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(...)      LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565(...)                   LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_WITH_OPA
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_WITH_OPA(...)          LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_WITH_MASK
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_WITH_MASK(...)         LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(...)      LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565(...)                 LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_WITH_OPA
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_WITH_OPA(...)        LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_WITH_MASK
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_WITH_MASK(...)       LV_RESULT_INVALID
+#endif
+
+#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA
+#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(...)    LV_RESULT_INVALID
+#endif
+
+/**********************
+ *   GLOBAL FUNCTIONS
+ **********************/
+
+/**
+ * Fill an area of an RGB565 buffer with a color.
+ * Supports normal fill, fill with opacity, fill with mask, and fill with mask and opacity.
+ * The color is converted with lv_color_to_u16(); the destination is written as 16 bit pixels without alpha.
+ * @param dsc   fill descriptor, of which this function reads:
+ *              - dest_buf, dest_w, dest_h: first pixel of the area, its width in pixels and its height in rows
+ *              - dest_stride: handed to drawbuf_next_row() to step dest_buf to the next row
+ *              - color, opa: fill color and overall opacity (opa >= LV_OPA_MAX counts as fully opaque)
+ *              - mask_buf, mask_stride: optional per-pixel opacity bytes (NULL when unused) and their row step
+ *              - use_asm: for the normal fill only, run LV_DRAW_SW_COLOR_BLEND_TO_RGB565(dsc) instead of the
+ *                C loop; the value that macro evaluates to is not checked here
+ * The remaining three cases fall back to C when their LV_DRAW_SW_COLOR_BLEND_TO_RGB565_* hook yields LV_RESULT_INVALID.
+ */
+void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_color_to_rgb565(_lv_draw_sw_blend_fill_dsc_t *dsc)
+{
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    uint16_t color16 = lv_color_to_u16(dsc->color);
+    lv_opa_t opa = dsc->opa;
+    const lv_opa_t *mask = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+    uint16_t *dest_buf_u16 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+
+    int32_t x;
+    int32_t y;
+    /*Presumably LV_UNUSED() keeps the compiler quiet about locals that a given build does not touch*/
+    LV_UNUSED(w);
+    LV_UNUSED(h);
+    LV_UNUSED(x);
+    LV_UNUSED(y);
+    LV_UNUSED(opa);
+    LV_UNUSED(mask);
+    LV_UNUSED(color16);
+    LV_UNUSED(mask_stride);
+    LV_UNUSED(dest_stride);
+    LV_UNUSED(dest_buf_u16);
+
+    /*Simple fill*/
+    if (mask == NULL && opa >= LV_OPA_MAX)  {
+        if (dsc->use_asm) {
+            LV_DRAW_SW_COLOR_BLEND_TO_RGB565(dsc);
+        } else {
+            for (y = 0; y < h; y++) {
+                uint16_t *dest_end_final = dest_buf_u16 + w;
+                uint32_t *dest_end_mid = (uint32_t *)((uint16_t *) dest_buf_u16 + ((w - 1) & ~(0xF)));
+                if ((lv_uintptr_t)&dest_buf_u16[0] & 0x3) {     /*Not 4 byte aligned: write one pixel first*/
+                    dest_buf_u16[0] = color16;
+                    dest_buf_u16++;
+                }
+                /*Pack two pixels into one 32 bit word; the loop below stores eight words (16 pixels) per pass*/
+                uint32_t c32 = (uint32_t)color16 + ((uint32_t)color16 << 16);
+                uint32_t *dest32 = (uint32_t *)dest_buf_u16;
+                while (dest32 < dest_end_mid) {
+                    dest32[0] = c32;
+                    dest32[1] = c32;
+                    dest32[2] = c32;
+                    dest32[3] = c32;
+                    dest32[4] = c32;
+                    dest32[5] = c32;
+                    dest32[6] = c32;
+                    dest32[7] = c32;
+                    dest32 += 8;
+                }
+
+                dest_buf_u16 = (uint16_t *)dest32;
+                /*Remaining pixels of the row, one at a time*/
+                while (dest_buf_u16 < dest_end_final) {
+                    *dest_buf_u16 = color16;
+                    dest_buf_u16++;
+                }
+                /*Step on by dest_stride, then go back the w pixels walked in this row*/
+                dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                dest_buf_u16 -= w;
+            }
+        }
+
+    }
+    /*Opacity only*/
+    else if (mask == NULL && opa < LV_OPA_MAX) {
+        if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB565_WITH_OPA(dsc)) {
+            uint32_t last_dest32_color = dest_buf_u16[0] + 1; /*Set to value which is not equal to the first pixel*/
+            uint32_t last_res32_color = 0;
+
+            for (y = 0; y < h; y++) {
+                x = 0;
+                if ((lv_uintptr_t)&dest_buf_u16[0] & 0x3) {
+                    dest_buf_u16[0] = lv_color_16_16_mix(color16, dest_buf_u16[0], opa);
+                    x = 1;
+                }
+                /*Work on pixel pairs; a pair of equal pixels is mixed once and the 32 bit result is remembered for reuse*/
+                for (; x < w - 2; x += 2) {
+                    if (dest_buf_u16[x] != dest_buf_u16[x + 1]) {
+                        dest_buf_u16[x + 0] = lv_color_16_16_mix(color16, dest_buf_u16[x + 0], opa);
+                        dest_buf_u16[x + 1] = lv_color_16_16_mix(color16, dest_buf_u16[x + 1], opa);
+                    } else {
+                        volatile uint32_t *dest32 = (uint32_t *)&dest_buf_u16[x];
+                        if (last_dest32_color == *dest32) {
+                            *dest32 = last_res32_color;
+                        } else {
+                            last_dest32_color =  *dest32;
+
+                            dest_buf_u16[x] = lv_color_16_16_mix(color16, dest_buf_u16[x + 0], opa);
+                            dest_buf_u16[x + 1] = dest_buf_u16[x];
+
+                            last_res32_color = *dest32;
+                        }
+                    }
+                }
+
+                for (; x < w ; x++) {
+                    dest_buf_u16[x] = lv_color_16_16_mix(color16, dest_buf_u16[x], opa);
+                }
+                dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+            }
+        }
+
+    }
+
+    /*Masked with full opacity*/
+    else if (mask && opa >= LV_OPA_MAX) {
+        if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB565_WITH_MASK(dsc)) {
+            for (y = 0; y < h; y++) {
+                x = 0;
+                if ((lv_uintptr_t)(mask) & 0x1) {   /*Odd mask address: do one pixel so &mask[x] is even for the reads below*/
+                    dest_buf_u16[x] = lv_color_16_16_mix(color16, dest_buf_u16[x], mask[x]);
+                    x++;
+                }
+                /*Read two mask bytes at once: both 0xFF -> write the color, both 0 -> skip, otherwise mix each pixel*/
+                for (; x <= w - 2; x += 2) {
+                    uint16_t mask16 = *((uint16_t *)&mask[x]);
+                    if (mask16 == 0xFFFF) {
+                        dest_buf_u16[x + 0] = color16;
+                        dest_buf_u16[x + 1] = color16;
+                    } else if (mask16 != 0) {
+                        dest_buf_u16[x + 0] = lv_color_16_16_mix(color16, dest_buf_u16[x + 0], mask[x + 0]);
+                        dest_buf_u16[x + 1] = lv_color_16_16_mix(color16, dest_buf_u16[x + 1], mask[x + 1]);
+                    }
+                }
+
+                for (; x < w ; x++) {
+                    dest_buf_u16[x] = lv_color_16_16_mix(color16, dest_buf_u16[x], mask[x]);
+                }
+                dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                mask += mask_stride;
+            }
+        }
+
+    }
+    /*Masked with opacity*/
+    else if (mask && opa < LV_OPA_MAX) {
+        if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB565_MIX_MASK_OPA(dsc)) {
+            for (y = 0; y < h; y++) {
+                for (x = 0; x < w; x++) {
+                    dest_buf_u16[x] = lv_color_16_16_mix(color16, dest_buf_u16[x], LV_OPA_MIX2(mask[x], opa));
+                }
+                dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                mask += mask_stride;
+            }
+        }
+    }
+}
+
+void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_image_to_rgb565(_lv_draw_sw_blend_image_dsc_t *dsc)
+{
+    /*Hand the descriptor to the blender written for the source image's color format*/
+    switch (dsc->src_color_format) {
+    case LV_COLOR_FORMAT_AL88:
+        al88_image_blend(dsc);
+        return;
+    case LV_COLOR_FORMAT_L8:
+        l8_image_blend(dsc);
+        return;
+    case LV_COLOR_FORMAT_ARGB8888:
+        argb8888_image_blend(dsc);
+        return;
+    case LV_COLOR_FORMAT_XRGB8888:
+        rgb888_image_blend(dsc, 4);     /*4 bytes per source pixel*/
+        return;
+    case LV_COLOR_FORMAT_RGB888:
+        rgb888_image_blend(dsc, 3);     /*3 bytes per source pixel*/
+        return;
+    case LV_COLOR_FORMAT_RGB565:
+        rgb565_image_blend(dsc);
+        return;
+    default:
+        LV_LOG_WARN("Not supported source color format");
+    }
+}
+
+/**********************
+ *   STATIC FUNCTIONS
+ **********************/
+/*Blend an AL88 source image (lumi and alpha per pixel) onto the RGB565 destination described by dsc*/
+static void LV_ATTRIBUTE_FAST_MEM al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    uint16_t *dest_buf_u16 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const lv_color16a_t *src_buf_al88 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    int32_t dest_x;
+    int32_t src_x;
+    int32_t y;
+    /*Normal mode: one branch per mask/opa combination, each trying its hook before the C loop*/
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = lv_color_8_16_mix(src_buf_al88[src_x].lumi, dest_buf_u16[dest_x], src_buf_al88[src_x].alpha);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_WITH_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = lv_color_8_16_mix(src_buf_al88[src_x].lumi, dest_buf_u16[dest_x],
+                                               LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = lv_color_8_16_mix(src_buf_al88[src_x].lumi, dest_buf_u16[dest_x],
+                                               LV_OPA_MIX2(src_buf_al88[src_x].alpha, mask_buf[dest_x]));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        } else if (mask_buf && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_AL88_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = lv_color_8_16_mix(src_buf_al88[src_x].lumi, dest_buf_u16[dest_x],
+                                               LV_OPA_MIX3(src_buf_al88[src_x].alpha, mask_buf[dest_x], opa));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {
+        uint16_t res = 0;   /*RGB565 result of the per-channel operation, before it is mixed into the destination*/
+        for (y = 0; y < h; y++) {
+            lv_color16_t *dest_buf_c16 = (lv_color16_t *)dest_buf_u16;
+            for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {   /*src_buf_al88 is indexed in whole pixels: step by one, as the normal-mode loops do*/
+                uint8_t rb = src_buf_al88[src_x].lumi >> 3;
+                uint8_t g = src_buf_al88[src_x].lumi >> 2;
+                switch (dsc->blend_mode) {
+                case LV_BLEND_MODE_ADDITIVE:
+                    res = (LV_MIN(dest_buf_c16[dest_x].red + rb, 31)) << 11;
+                    res += (LV_MIN(dest_buf_c16[dest_x].green + g, 63)) << 5;
+                    res += LV_MIN(dest_buf_c16[dest_x].blue + rb, 31);
+                    break;
+                case LV_BLEND_MODE_SUBTRACTIVE:
+                    res = (LV_MAX(dest_buf_c16[dest_x].red - rb, 0)) << 11;
+                    res += (LV_MAX(dest_buf_c16[dest_x].green - g, 0)) << 5;
+                    res += LV_MAX(dest_buf_c16[dest_x].blue - rb, 0);
+                    break;
+                case LV_BLEND_MODE_MULTIPLY:
+                    res = ((dest_buf_c16[dest_x].red * rb) >> 5) << 11;
+                    res += ((dest_buf_c16[dest_x].green * g) >> 6) << 5;
+                    res += (dest_buf_c16[dest_x].blue * rb) >> 5;
+                    break;
+                default:
+                    LV_LOG_WARN("Not supported blend mode: %d", dsc->blend_mode);
+                    return;
+                }
+                if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+                    dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], src_buf_al88[src_x].alpha);
+                } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+                    dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], LV_OPA_MIX2(opa, src_buf_al88[src_x].alpha));
+                } else {
+                    if (opa >= LV_OPA_MAX) {
+                        dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], mask_buf[dest_x]);   /*NOTE(review): source alpha is not applied on this path - confirm intent*/
+                    } else dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], LV_OPA_MIX3(mask_buf[dest_x], opa,
+                                                      src_buf_al88[src_x].alpha));
+                }
+            }
+
+            dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+            src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride);
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+        }
+    }
+}
+/*Blend an L8 source image (one 8 bit value per pixel) onto the RGB565 destination described by dsc*/
+static void LV_ATTRIBUTE_FAST_MEM l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    uint16_t *dest_buf_u16 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const uint8_t *src_buf_l8 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    int32_t dest_x;
+    int32_t src_x;
+    int32_t y;
+    /*Normal mode: one branch per mask/opa combination, each trying its hook before the C loop*/
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = l8_to_rgb565(src_buf_l8[src_x]);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_l8 += src_stride;
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_WITH_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = lv_color_8_16_mix(src_buf_l8[src_x], dest_buf_u16[dest_x], opa);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_l8 += src_stride;
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = lv_color_8_16_mix(src_buf_l8[src_x], dest_buf_u16[dest_x], mask_buf[dest_x]);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_l8 += src_stride;
+                    mask_buf += mask_stride;
+                }
+            }
+        } else if (mask_buf && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {
+                        dest_buf_u16[dest_x] = lv_color_8_16_mix(src_buf_l8[src_x], dest_buf_u16[dest_x], LV_OPA_MIX2(mask_buf[dest_x], opa));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_l8 += src_stride;
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {
+        uint16_t res = 0;   /*RGB565 result of the per-channel operation, before it is mixed into the destination*/
+        for (y = 0; y < h; y++) {
+            lv_color16_t *dest_buf_c16 = (lv_color16_t *)dest_buf_u16;
+            for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x++) {   /*One source byte per destination pixel, as in the normal-mode loops*/
+                uint8_t rb = src_buf_l8[src_x] >> 3;
+                uint8_t g = src_buf_l8[src_x] >> 2;
+                switch (dsc->blend_mode) {
+                case LV_BLEND_MODE_ADDITIVE:
+                    res = (LV_MIN(dest_buf_c16[dest_x].red + rb, 31)) << 11;
+                    res += (LV_MIN(dest_buf_c16[dest_x].green + g, 63)) << 5;
+                    res += LV_MIN(dest_buf_c16[dest_x].blue + rb, 31);
+                    break;
+                case LV_BLEND_MODE_SUBTRACTIVE:
+                    res = (LV_MAX(dest_buf_c16[dest_x].red - rb, 0)) << 11;
+                    res += (LV_MAX(dest_buf_c16[dest_x].green - g, 0)) << 5;
+                    res += LV_MAX(dest_buf_c16[dest_x].blue - rb, 0);
+                    break;
+                case LV_BLEND_MODE_MULTIPLY:
+                    res = ((dest_buf_c16[dest_x].red * rb) >> 5) << 11;
+                    res += ((dest_buf_c16[dest_x].green * g) >> 6) << 5;
+                    res += (dest_buf_c16[dest_x].blue * rb) >> 5;
+                    break;
+                default:
+                    LV_LOG_WARN("Not supported blend mode: %d", dsc->blend_mode);
+                    return;
+                }
+                /*Write res directly when opaque and unmasked, otherwise mix it in through opa and/or the mask*/
+                if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+                    dest_buf_u16[dest_x] = res;
+                } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+                    dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], opa);
+                } else {
+                    if (opa >= LV_OPA_MAX) {
+                        dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], mask_buf[dest_x]);
+                    } else {
+                        dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], LV_OPA_MIX2(mask_buf[dest_x], opa));
+                    }
+                }
+            }
+
+            dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+            src_buf_l8 += src_stride;
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+        }
+    }
+}
+/*Blend an RGB565 source image onto the RGB565 destination described by dsc*/
+static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    uint16_t *dest_buf_u16 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const uint16_t *src_buf_u16 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    int32_t x;
+    int32_t y;
+    /*Normal mode: one branch per mask/opa combination, each trying its hook before the C loop*/
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc)) {
+                uint32_t line_in_bytes = w * 2;
+                for (y = 0; y < h; y++) {
+                    memcpy(dest_buf_u16, src_buf_u16, line_in_bytes);   // lv_memcpy
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_WITH_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        dest_buf_u16[x] = lv_color_16_16_mix(src_buf_u16[x], dest_buf_u16[x], opa);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        dest_buf_u16[x] = lv_color_16_16_mix(src_buf_u16[x], dest_buf_u16[x], mask_buf[x]);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        } else {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (x = 0; x < w; x++) {
+                        dest_buf_u16[x] = lv_color_16_16_mix(src_buf_u16[x], dest_buf_u16[x], LV_OPA_MIX2(mask_buf[x], opa));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {
+        uint16_t res = 0;   /*RGB565 result of the per-channel operation, before it is mixed into the destination*/
+        for (y = 0; y < h; y++) {
+            lv_color16_t *dest_buf_c16 = (lv_color16_t *) dest_buf_u16;
+            lv_color16_t *src_buf_c16 = (lv_color16_t *) src_buf_u16;
+            for (x = 0; x < w; x++) {
+                switch (dsc->blend_mode) {
+                case LV_BLEND_MODE_ADDITIVE:
+                    if (src_buf_u16[x] == 0x0000) {
+                        continue;    /*Do not add pure black*/
+                    }
+                    res = (LV_MIN(dest_buf_c16[x].red + src_buf_c16[x].red, 31)) << 11;
+                    res += (LV_MIN(dest_buf_c16[x].green + src_buf_c16[x].green, 63)) << 5;
+                    res += LV_MIN(dest_buf_c16[x].blue + src_buf_c16[x].blue, 31);
+                    break;
+                case LV_BLEND_MODE_SUBTRACTIVE:
+                    if (src_buf_u16[x] == 0x0000) {
+                        continue;    /*Do not subtract pure black*/
+                    }
+                    res = (LV_MAX(dest_buf_c16[x].red - src_buf_c16[x].red, 0)) << 11;
+                    res += (LV_MAX(dest_buf_c16[x].green - src_buf_c16[x].green, 0)) << 5;
+                    res += LV_MAX(dest_buf_c16[x].blue - src_buf_c16[x].blue, 0);
+                    break;
+                case LV_BLEND_MODE_MULTIPLY:
+                    if (src_buf_u16[x] == 0xffff) {
+                        continue;    /*Do not multiply with pure white (considered as 1)*/
+                    }
+                    res = ((dest_buf_c16[x].red * src_buf_c16[x].red) >> 5) << 11;
+                    res += ((dest_buf_c16[x].green * src_buf_c16[x].green) >> 6) << 5;
+                    res += (dest_buf_c16[x].blue * src_buf_c16[x].blue) >> 5;
+                    break;
+                default:
+                    LV_LOG_WARN("Not supported blend mode: %d", dsc->blend_mode);
+                    return;
+                }
+                /*Without a mask res is always mixed with opa; with a mask, by the mask alone or by mask and opa combined*/
+                if (mask_buf == NULL) {
+                    dest_buf_u16[x] = lv_color_16_16_mix(res, dest_buf_u16[x], opa);
+                } else {
+                    if (opa >= LV_OPA_MAX) {
+                        dest_buf_u16[x] = lv_color_16_16_mix(res, dest_buf_u16[x], mask_buf[x]);
+                    } else {
+                        dest_buf_u16[x] = lv_color_16_16_mix(res, dest_buf_u16[x], LV_OPA_MIX2(mask_buf[x], opa));
+                    }
+                }
+            }
+
+            dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+            src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+        }
+    }
+}
+
+static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, const uint8_t src_px_size)
+{
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    uint16_t *dest_buf_u16 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;
+    const uint8_t *src_buf_u8 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;
+    const lv_opa_t *mask_buf = dsc->mask_buf;
+    int32_t mask_stride = dsc->mask_stride;
+
+    int32_t dest_x;
+    int32_t src_x;
+    int32_t y;
+
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565(dsc, src_px_size)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                        dest_buf_u16[dest_x]  = ((src_buf_u8[src_x + 2] & 0xF8) << 8) +
+                                                ((src_buf_u8[src_x + 1] & 0xFC) << 3) +
+                                                ((src_buf_u8[src_x + 0] & 0xF8) >> 3);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_WITH_OPA(dsc, src_px_size)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                        dest_buf_u16[dest_x] = lv_color_24_16_mix(&src_buf_u8[src_x], dest_buf_u16[dest_x], opa);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                }
+            }
+        }
+        if (mask_buf && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_WITH_MASK(dsc, src_px_size)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                        dest_buf_u16[dest_x] = lv_color_24_16_mix(&src_buf_u8[src_x], dest_buf_u16[dest_x], mask_buf[dest_x]);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+        if (mask_buf && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(dsc, src_px_size)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                        dest_buf_u16[dest_x] = lv_color_24_16_mix(&src_buf_u8[src_x], dest_buf_u16[dest_x], LV_OPA_MIX2(mask_buf[dest_x], opa));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {
+        uint16_t res = 0;
+        for (y = 0; y < h; y++) {
+            lv_color16_t *dest_buf_c16 = (lv_color16_t *) dest_buf_u16;
+            for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += src_px_size) {
+                switch (dsc->blend_mode) {
+                case LV_BLEND_MODE_ADDITIVE:
+                    res = (LV_MIN(dest_buf_c16[dest_x].red + (src_buf_u8[src_x + 2] >> 3), 31)) << 11;
+                    res += (LV_MIN(dest_buf_c16[dest_x].green + (src_buf_u8[src_x + 1] >> 2), 63)) << 5;
+                    res += LV_MIN(dest_buf_c16[dest_x].blue + (src_buf_u8[src_x + 0] >> 3), 31);
+                    break;
+                case LV_BLEND_MODE_SUBTRACTIVE:
+                    res = (LV_MAX(dest_buf_c16[dest_x].red - (src_buf_u8[src_x + 2] >> 3), 0)) << 11;
+                    res += (LV_MAX(dest_buf_c16[dest_x].green - (src_buf_u8[src_x + 1] >> 2), 0)) << 5;
+                    res += LV_MAX(dest_buf_c16[dest_x].blue - (src_buf_u8[src_x + 0] >> 3), 0);
+                    break;
+                case LV_BLEND_MODE_MULTIPLY:
+                    res = ((dest_buf_c16[dest_x].red * (src_buf_u8[src_x + 2] >> 3)) >> 5) << 11;
+                    res += ((dest_buf_c16[dest_x].green * (src_buf_u8[src_x + 1] >> 2)) >> 6) << 5;
+                    res += (dest_buf_c16[dest_x].blue * (src_buf_u8[src_x + 0] >> 3)) >> 5;
+                    break;
+                default:
+                    LV_LOG_WARN("Not supported blend mode: %d", dsc->blend_mode);
+                    return;
+                }
+
+                if (mask_buf == NULL) {
+                    dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], opa);
+                } else {
+                    if (opa >= LV_OPA_MAX) {
+                        dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], mask_buf[dest_x]);
+                    } else {
+                        dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], LV_OPA_MIX2(mask_buf[dest_x], opa));
+                    }
+                }
+            }
+            dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+            src_buf_u8 += src_stride;
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+        }
+
+    }
+}
+/* Blend an ARGB8888 source image (4 bytes per pixel, byte 3 used as alpha) into the RGB565 destination buffer. */
+static void LV_ATTRIBUTE_FAST_MEM argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc)
+{
+    int32_t w = dsc->dest_w;
+    int32_t h = dsc->dest_h;
+    lv_opa_t opa = dsc->opa;
+    uint16_t *dest_buf_u16 = dsc->dest_buf;
+    int32_t dest_stride = dsc->dest_stride;         // applied as a byte offset via drawbuf_next_row()
+    const uint8_t *src_buf_u8 = dsc->src_buf;
+    int32_t src_stride = dsc->src_stride;           // added to the byte pointer once per row
+    const lv_opa_t *mask_buf = dsc->mask_buf;       // optional per-pixel mask, NULL when unused
+    int32_t mask_stride = dsc->mask_stride;
+
+    int32_t dest_x;
+    int32_t src_x;
+    int32_t y;
+
+    if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {  // one branch per (mask, opa) combination
+        if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565(dsc)) {  // C loop runs only when the macro yields LV_RESULT_INVALID
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += 4) {
+                        dest_buf_u16[dest_x] = lv_color_24_16_mix(&src_buf_u8[src_x], dest_buf_u16[dest_x], src_buf_u8[src_x + 3]);
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                }
+            }
+        } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_WITH_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += 4) {
+                        dest_buf_u16[dest_x] = lv_color_24_16_mix(&src_buf_u8[src_x], dest_buf_u16[dest_x], LV_OPA_MIX2(src_buf_u8[src_x + 3],
+                                               opa));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                }
+            }
+        } else if (mask_buf && opa >= LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_WITH_MASK(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += 4) {
+                        dest_buf_u16[dest_x] = lv_color_24_16_mix(&src_buf_u8[src_x], dest_buf_u16[dest_x],
+                                               LV_OPA_MIX2(src_buf_u8[src_x + 3], mask_buf[dest_x]));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                    mask_buf += mask_stride;
+                }
+            }
+        } else if (mask_buf && opa < LV_OPA_MAX) {
+            if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB565_MIX_MASK_OPA(dsc)) {
+                for (y = 0; y < h; y++) {
+                    for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += 4) {
+                        dest_buf_u16[dest_x] = lv_color_24_16_mix(&src_buf_u8[src_x], dest_buf_u16[dest_x],
+                                               LV_OPA_MIX3(src_buf_u8[src_x + 3], mask_buf[dest_x], opa));
+                    }
+                    dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+                    src_buf_u8 += src_stride;
+                    mask_buf += mask_stride;
+                }
+            }
+        }
+    } else {  // other blend modes: compute the blended RGB565 value per pixel, then mix it into the destination
+        uint16_t res = 0;
+        for (y = 0; y < h; y++) {
+            lv_color16_t *dest_buf_c16 = (lv_color16_t *) dest_buf_u16;
+            for (dest_x = 0, src_x = 0; dest_x < w; dest_x++, src_x += 4) {
+                switch (dsc->blend_mode) {
+                case LV_BLEND_MODE_ADDITIVE:
+                    res = (LV_MIN(dest_buf_c16[dest_x].red + (src_buf_u8[src_x + 2] >> 3), 31)) << 11;
+                    res += (LV_MIN(dest_buf_c16[dest_x].green + (src_buf_u8[src_x + 1] >> 2), 63)) << 5;
+                    res += LV_MIN(dest_buf_c16[dest_x].blue + (src_buf_u8[src_x + 0] >> 3), 31);
+                    break;
+                case LV_BLEND_MODE_SUBTRACTIVE:
+                    res = (LV_MAX(dest_buf_c16[dest_x].red - (src_buf_u8[src_x + 2] >> 3), 0)) << 11;
+                    res += (LV_MAX(dest_buf_c16[dest_x].green - (src_buf_u8[src_x + 1] >> 2), 0)) << 5;
+                    res += LV_MAX(dest_buf_c16[dest_x].blue - (src_buf_u8[src_x + 0] >> 3), 0);
+                    break;
+                case LV_BLEND_MODE_MULTIPLY:
+                    res = ((dest_buf_c16[dest_x].red * (src_buf_u8[src_x + 2] >> 3)) >> 5) << 11;
+                    res += ((dest_buf_c16[dest_x].green * (src_buf_u8[src_x + 1] >> 2)) >> 6) << 5;
+                    res += (dest_buf_c16[dest_x].blue * (src_buf_u8[src_x + 0] >> 3)) >> 5;
+                    break;
+                default:
+                    LV_LOG_WARN("Not supported blend mode: %d", dsc->blend_mode);
+                    return;  // unsupported mode: stop without writing any further pixels
+                }
+
+                if (mask_buf == NULL && opa >= LV_OPA_MAX) {
+                    dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], src_buf_u8[src_x + 3]);
+                } else if (mask_buf == NULL && opa < LV_OPA_MAX) {
+                    dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], LV_OPA_MIX2(opa, src_buf_u8[src_x + 3]));
+                } else {
+                    if (opa >= LV_OPA_MAX) {  // NOTE(review): source alpha (src_buf_u8[src_x + 3]) is not applied in this branch, unlike the others - confirm intended
+                        dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], mask_buf[dest_x]);
+                    } else dest_buf_u16[dest_x] = lv_color_16_16_mix(res, dest_buf_u16[dest_x], LV_OPA_MIX3(mask_buf[dest_x], opa,
+                                                      src_buf_u8[src_x + 3]));
+                }
+            }
+
+            dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
+            src_buf_u8 += src_stride;
+            if (mask_buf) {
+                mask_buf += mask_stride;
+            }
+        }
+    }
+}
+/* Pack one 8-bit value into all three RGB565 channels: top 5 bits as red and blue, top 6 bits as green. */
+static inline uint16_t LV_ATTRIBUTE_FAST_MEM l8_to_rgb565(const uint8_t c1)
+{
+    return ((c1 & 0xF8) << 8) + ((c1 & 0xFC) << 3) + ((c1 & 0xF8) >> 3);
+}
+/* Mix the 8-bit value c1 (used for every channel) with RGB565 colour c2, weighting c1 by mix and c2 by 255 - mix. */
+static inline uint16_t LV_ATTRIBUTE_FAST_MEM lv_color_8_16_mix(const uint8_t c1, uint16_t c2, uint8_t mix)
+{
+
+    if (mix == 0) {             // c1 has no weight: keep c2 unchanged
+        return c2;
+    } else if (mix == 255) {    // c1 has full weight: return c1 packed as RGB565
+        return ((c1 & 0xF8) << 8) + ((c1 & 0xFC) << 3) + ((c1 & 0xF8) >> 3);
+    } else {
+        lv_opa_t mix_inv = 255 - mix;
+
+        return ((((c1 >> 3) * mix + ((c2 >> 11) & 0x1F) * mix_inv) << 3) & 0xF800) +    // red, bits 15..11
+               ((((c1 >> 2) * mix + ((c2 >> 5) & 0x3F) * mix_inv) >> 3) & 0x07E0) +     // green, bits 10..5
+               (((c1 >> 3) * mix + (c2 & 0x1F) * mix_inv) >> 8);                        // blue, bits 4..0
+    }
+}
+/* Mix a 3-byte colour (c1[0] = blue, c1[1] = green, c1[2] = red) with RGB565 colour c2, weighting c1 by mix. */
+static inline uint16_t LV_ATTRIBUTE_FAST_MEM lv_color_24_16_mix(const uint8_t *c1, uint16_t c2, uint8_t mix)
+{
+    if (mix == 0) {             // c1 has no weight: keep c2 unchanged
+        return c2;
+    } else if (mix == 255) {    // c1 has full weight: return c1 packed as RGB565
+        return ((c1[2] & 0xF8) << 8)  + ((c1[1] & 0xFC) << 3) + ((c1[0] & 0xF8) >> 3);
+    } else {
+        lv_opa_t mix_inv = 255 - mix;
+
+        return ((((c1[2] >> 3) * mix + ((c2 >> 11) & 0x1F) * mix_inv) << 3) & 0xF800) +    // red, bits 15..11
+               ((((c1[1] >> 2) * mix + ((c2 >> 5) & 0x3F) * mix_inv) >> 3) & 0x07E0) +     // green, bits 10..5
+               (((c1[0] >> 3) * mix + (c2 & 0x1F) * mix_inv) >> 8);                        // blue, bits 4..0
+    }
+}
+/* Return buf advanced by stride bytes; the const qualifier of buf is dropped in the returned pointer. */
+static inline void *LV_ATTRIBUTE_FAST_MEM drawbuf_next_row(const void *buf, uint32_t stride)
+{
+    return (void *)((uint8_t *)buf + stride);
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
new file mode 100644
index 00000000..5243857e
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
@@ -0,0 +1,73 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "esp_err.h"
+#include <stdint.h>
+#include "lv_color.h"
+#include "lv_draw_sw_blend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------------------------------------- Macros and Types --------------------------------------------------
+
+/**
+ * @brief Functionality test combinations
+ */
+typedef struct {
+    unsigned int min_w;                     // Minimum width of the test array
+    unsigned int min_h;                     // Minimum height of the test array
+    unsigned int max_w;                     // Maximum width of the test array
+    unsigned int max_h;                     // Maximum height of the test array
+    unsigned int min_unalign_byte;          // Minimum amount of unaligned bytes of the test array
+    unsigned int max_unalign_byte;          // Maximum amount of unaligned bytes of the test array
+    unsigned int unalign_step;              // Increment step in bytes unalignment of the test array
+    unsigned int dest_stride_step;          // Increment step in destination stride of the test array
+    unsigned int test_combinations_count;   // Count of test combinations
+} test_matrix_params_t;
+
+/**
+ * @brief Functionality test case parameters
+ */
+typedef struct {
+    struct {
+        void *p_asm;                                        // pointer to the working ASM test buf
+        void *p_ansi;                                       // pointer to the working ANSI test buf
+        void *p_asm_alloc;                                  // pointer to the beginning of the memory allocated for ASM test buf, used in free()
+        void *p_ansi_alloc;                                 // pointer to the beginning of the memory allocated for ANSI test buf, used in free()
+    } buf;
+    void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function
+    lv_color_format_t color_format;                         // LV color format
+    size_t data_type_size;                                  // Size of one buffer element in bytes, e.g. sizeof(uint32_t)
+    size_t active_buf_len;                                  // Length of buffer in elements, where the actual data are stored (not including Canary bytes)
+    size_t total_buf_len;                                   // Total length of buffer in elements (including Canary bytes)
+    unsigned int dest_w;                                    // Destination buffer width
+    unsigned int dest_h;                                    // Destination buffer height
+    unsigned int dest_stride;                               // Destination buffer stride in elements
+    unsigned int unalign_byte;                              // Destination buffer memory unalignment
+} func_test_case_params_t;
+
+/**
+ * @brief Benchmark test case parameters
+ */
+typedef struct {
+    unsigned int height;                                    // Test array height
+    unsigned int width;                                     // Test array width
+    unsigned int stride;                                    // Test array stride in bytes
+    unsigned int cc_height;                                 // Corner case test array height
+    unsigned int cc_width;                                  // Corner case test array width
+    unsigned int benchmark_cycles;                          // Count of benchmark cycles
+    void *array_align16;                                    // test array with 16 byte alignment - testing most ideal case
+    void *array_align1;                                     // test array with 1 byte alignment - testing worst case
+    void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function
+} bench_test_case_params_t;
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_app_main.c b/components/esp_lvgl_port/test_apps/simd/main/test_app_main.c
new file mode 100644
index 00000000..c02cc997
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_app_main.c
@@ -0,0 +1,50 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stdio.h>
+#include "unity.h"
+#include "unity_test_utils.h"
+#include "lv_fill_common.h"
+
+#define TEST_MEMORY_LEAK_THRESHOLD (300)
+
+void app_main(void)
+{
+
+    // ______  _____ ______   _               _
+    // |  _  \/  ___|| ___ \ | |             | |
+    // | | | |\ `--. | |_/ / | |_   ___  ___ | |_
+    // | | | | `--. \|  __/  | __| / _ \/ __|| __|
+    // | |/ / /\__/ /| |     | |_ |  __/\__ \| |_
+    // |___/  \____/ \_|      \__| \___||___/ \__|
+    // Print the banner above line by line (backslashes are escaped in the string literals)
+    printf("______  _____ ______   _               _   \r\n");
+    printf("|  _  \\/  ___|| ___ \\ | |             | |  \r\n");
+    printf("| | | |\\ `--. | |_/ / | |_   ___  ___ | |_ \r\n");
+    printf("| | | | `--. \\|  __/  | __| / _ \\/ __|| __|\r\n");
+    printf("| |/ / /\\__/ /| |     | |_ |  __/\\__ \\| |_ \r\n");
+    printf("|___/  \\____/ \\_|      \\__| \\___||___/ \\__|\r\n");
+
+    // Hand over to the Unity test menu
+    UNITY_BEGIN();
+    unity_run_menu();
+    UNITY_END();
+}
+
+/* setUp runs before every test: sets the leak threshold to TEST_MEMORY_LEAK_THRESHOLD and records the free memory */
+void setUp(void)
+{
+    // Check for memory leaks
+    unity_utils_set_leak_level(TEST_MEMORY_LEAK_THRESHOLD);
+    unity_utils_record_free_mem();
+}
+
+/* tearDown runs after every test: evaluates leaks through the unity_utils helper */
+void tearDown(void)
+{
+    // Evaluate memory leaks
+    unity_utils_evaluate_leaks();
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c
new file mode 100644
index 00000000..85935985
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c
@@ -0,0 +1,176 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <string.h>
+#include <malloc.h>
+#include <sdkconfig.h>
+
+#include "unity.h"
+#include "esp_log.h"
+#include "freertos/FreeRTOS.h"  // for xthal_get_ccount()
+#include "lv_fill_common.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_draw_sw_blend_to_argb8888.h"
+#include "lv_draw_sw_blend_to_rgb565.h"
+
+#define WIDTH 128
+#define HEIGHT 128
+#define STRIDE WIDTH
+#define UNALIGN_BYTES 1
+#define BENCHMARK_CYCLES 1000
+
+// ------------------------------------------------- Macros and Types --------------------------------------------------
+
+static const char *TAG_LV_FILL_BENCH = "LV Fill Benchmark";     // log tag of this file
+static const char *asm_ansi_func[] = {"ASM", "ANSI"};           // log labels, indexed by benchmark pass (0 = ASM, 1 = ANSI)
+static lv_color_t test_color = {                                // fill colour passed to every benchmarked blend call
+    .blue = 0x56,
+    .green = 0x34,
+    .red = 0x12,
+};
+
+// ------------------------------------------------ Static function headers --------------------------------------------
+
+/**
+ * @brief Initialize the benchmark test
+ */
+static void lv_fill_benchmark_init(bench_test_case_params_t *test_params);
+
+/**
+ * @brief Run the benchmark test
+ */
+static float lv_fill_benchmark_run(bench_test_case_params_t *test_params, _lv_draw_sw_blend_fill_dsc_t *dsc);
+
+// ------------------------------------------------ Test cases ---------------------------------------------------------
+
+/*
+Benchmark tests
+
+Requires:
+    - To pass functionality tests first
+
+Purpose:
+    - Test that an acceleration is achieved by an assembly implementation of LVGL blending API
+
+Procedure:
+    - Initialize input parameters (test array length, width, allocate array...) of the benchmark test
+    - Run assembly version of LVGL blending API multiple times (1000-times or so)
+    - First use input test parameters for the ideal case (16-byte aligned array, array width and height divisible by 4 for ARGB8888 color format)
+    - Then use worst-case input test parameters (1-byte aligned array, array width and height NOT divisible by 4 for ARGB8888 color format)
+    - Count how many CPU cycles it takes to run a function from the LVGL blending API for each case (ideal and worst case)
+    - Run the ANSI version of the LVGL blending API multiple times (1000 times or so), repeating the above steps for the ANSI version
+    - Free test arrays and structures needed for LVGL blending API
+*/
+// ------------------------------------------------ Test cases stages --------------------------------------------------
+
+TEST_CASE("LV Fill benchmark ARGB8888", "[fill][benchmark][ARGB8888]")
+{
+    uint32_t *dest_array_align16  = (uint32_t *)memalign(16, STRIDE * HEIGHT * sizeof(uint32_t) + UNALIGN_BYTES);
+    TEST_ASSERT_NOT_NULL(dest_array_align16);
+
+    // Apply byte unalignment for the worst-case test scenario (offset counted in bytes, not in uint32_t elements)
+    void *dest_array_align1 = (uint8_t *)dest_array_align16 + UNALIGN_BYTES;
+
+    bench_test_case_params_t test_params = {
+        .height = HEIGHT,
+        .width = WIDTH,
+        .stride = STRIDE * sizeof(uint32_t),
+        .cc_height = HEIGHT - 1,
+        .cc_width = WIDTH - 1,
+        .benchmark_cycles = BENCHMARK_CYCLES,
+        .array_align16 = (void *)dest_array_align16,
+        .array_align1 = dest_array_align1,
+        .blend_api_func = &lv_draw_sw_blend_color_to_argb8888,
+    };
+
+    ESP_LOGI(TAG_LV_FILL_BENCH, "running test for ARGB8888 color format");
+    lv_fill_benchmark_init(&test_params);
+    free(dest_array_align16);
+}
+
+TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]")
+{
+    uint16_t *dest_array_align16  = (uint16_t *)memalign(16, STRIDE * HEIGHT * sizeof(uint16_t) + UNALIGN_BYTES);
+    TEST_ASSERT_NOT_NULL(dest_array_align16);
+
+    // Apply byte unalignment for the worst-case test scenario (offset counted in bytes, not in uint16_t elements)
+    void *dest_array_align1 = (uint8_t *)dest_array_align16 + UNALIGN_BYTES;
+
+    bench_test_case_params_t test_params = {
+        .height = HEIGHT,
+        .width = WIDTH,
+        .stride = STRIDE * sizeof(uint16_t),
+        .cc_height = HEIGHT - 1,
+        .cc_width = WIDTH - 1,
+        .benchmark_cycles = BENCHMARK_CYCLES,
+        .array_align16 = (void *)dest_array_align16,
+        .array_align1 = dest_array_align1,
+        .blend_api_func = &lv_draw_sw_blend_color_to_rgb565,
+    };
+
+    ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB565 color format");
+    lv_fill_benchmark_init(&test_params);
+    free(dest_array_align16);
+}
+// ------------------------------------------------ Static test functions ----------------------------------------------
+
+static void lv_fill_benchmark_init(bench_test_case_params_t *test_params)
+{
+    // Init structure for LVGL blend API, ideal case (array_align16 buffer, full width and height); starts with the Assembly API
+    _lv_draw_sw_blend_fill_dsc_t dsc = {
+        .dest_buf = test_params->array_align16,
+        .dest_w = test_params->width,
+        .dest_h = test_params->height,
+        .dest_stride = test_params->stride,  // stride * sizeof()
+        .mask_buf = NULL,
+        .color = test_color,
+        .opa = LV_OPA_MAX,
+        .use_asm = true,
+    };
+
+    // Init structure for LVGL blend API, corner case (array_align1 buffer, corner-case width and height); same use_asm setting
+    _lv_draw_sw_blend_fill_dsc_t dsc_cc = dsc;
+    dsc_cc.dest_buf = test_params->array_align1;
+    dsc_cc.dest_w = test_params->cc_width;
+    dsc_cc.dest_h = test_params->cc_height;
+
+    // Run benchmark 2 times:
+    // First run using assembly, second run using ANSI
+    for (int i = 0; i < 2; i++) {
+
+        // Run benchmark with the most ideal input parameters
+        // Dest array is 16 byte aligned, dest_w and dest_h are divisible by 4
+        float cycles = lv_fill_benchmark_run(test_params, &dsc);        // Call Benchmark cycle
+        float per_sample = cycles / ((float)(dsc.dest_w * dsc.dest_h));
+        ESP_LOGI(TAG_LV_FILL_BENCH, " %s ideal case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample", asm_ansi_func[i], cycles, dsc.dest_w, dsc.dest_h, per_sample);
+
+        // Run benchmark with the corner case input parameters
+        // Dest array is 1 byte aligned, dest_w and dest_h are not divisible by 4
+        cycles = lv_fill_benchmark_run(test_params, &dsc_cc);           // Call Benchmark cycle
+        per_sample = cycles / ((float)(dsc_cc.dest_w * dsc_cc.dest_h));
+        ESP_LOGI(TAG_LV_FILL_BENCH, " %s corner case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample\n", asm_ansi_func[i], cycles, dsc_cc.dest_w, dsc_cc.dest_h, per_sample);
+
+        // Switch both descriptors to the ANSI implementation for the second pass
+        dsc.use_asm = false;
+        dsc_cc.use_asm = false;
+    }
+}
+
+static float lv_fill_benchmark_run(bench_test_case_params_t *test_params, _lv_draw_sw_blend_fill_dsc_t *dsc)
+{
+    // Call the DUT function once before measuring, so that this first call is not part of the cycle count
+    test_params->blend_api_func(dsc);
+
+    const unsigned int start_b = xthal_get_ccount();
+    for (unsigned int i = 0; i < test_params->benchmark_cycles; i++) {  // unsigned counter matches the type of benchmark_cycles
+        test_params->blend_api_func(dsc);
+    }
+    const unsigned int end_b = xthal_get_ccount();
+
+    const float total_b = end_b - start_b;                              // unsigned difference of the two counter reads
+    const float cycles = total_b / (test_params->benchmark_cycles);     // average count per blend call
+    return cycles;
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
new file mode 100644
index 00000000..972f8edf
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
@@ -0,0 +1,311 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <string.h>
+#include <malloc.h>
+#include <inttypes.h>
+#include "unity.h"
+#include "esp_log.h"
+#include "lv_fill_common.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_draw_sw_blend_to_argb8888.h"
+#include "lv_draw_sw_blend_to_rgb565.h"
+
+// ------------------------------------------------- Defines -----------------------------------------------------------
+
+#define DBG_PRINT_OUTPUT false
+#define CANARY_BYTES 4
+
+// ------------------------------------------------- Macros and Types --------------------------------------------------
+// Store the test-case geometry; macro parameter names must differ from the struct member names they are assigned to
+#define UPDATE_TEST_CASE(test_case_ptr, w, h, stride, unalign) ({                               \
+    (test_case_ptr)->active_buf_len = (size_t)((h) * (stride));                                 \
+    (test_case_ptr)->total_buf_len = (size_t)(((h) * (stride)) + (CANARY_BYTES * 2));           \
+    (test_case_ptr)->dest_w = (w);                  \
+    (test_case_ptr)->dest_h = (h);                  \
+    (test_case_ptr)->dest_stride = (stride);        \
+    (test_case_ptr)->unalign_byte = (unalign);      \
+})
+
+static const char *TAG_LV_FILL_FUNC = "LV Fill Functionality";  // log tag of this file
+static char test_msg_buf[128];                                  // description of the current test case, written by lv_fill_functionality()
+
+static lv_color_t test_color = {                                // fill colour passed to both the ASM and the ANSI blend call
+    .blue = 0x56,
+    .green = 0x34,
+    .red = 0x12,
+};
+
+// ------------------------------------------------ Static function headers --------------------------------------------
+
+/**
+ * @brief Generate all the functionality test combinations
+ *
+ * - generate functionality test combinations, based on the provided test_matrix struct
+ *
+ * @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case);
+
+/**
+ * @brief Fill test buffers for functionality test
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void fill_test_bufs(func_test_case_params_t *test_case);
+
+/**
+ * @brief The actual functionality test
+ *
+ * - function prepares structures for functionality testing and runs the LVGL API
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void lv_fill_functionality(func_test_case_params_t *test_case);
+
+/**
+ * @brief Evaluate results for 32bit data length
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void test_eval_32bit_data(func_test_case_params_t *test_case);
+
+/**
+ * @brief Evaluate results for 16bit data length
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void test_eval_16bit_data(func_test_case_params_t *test_case);
+
+// ------------------------------------------------ Test cases ---------------------------------------------------------
+
+/*
+Functionality tests
+
+Purpose:
+    - Test that an assembly version of LVGL blending API achieves the same results as the ANSI version
+
+Procedure:
+    - Prepare testing matrix, to cover all the possible combinations of destination array widths, lengths, memory alignment...
+    - Run assembly version of the LVGL blending API
+    - Run ANSI C version of the LVGL blending API
+    - Compare the results
+    - Repeat above 3 steps for each test matrix setup
+*/
+
+// ------------------------------------------------ Test cases stages --------------------------------------------------
+
+TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
+{
+    test_matrix_params_t test_matrix = {
+        .min_w = 8,             // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
+        .min_h = 1,
+        .max_w = 16,            // inclusive upper bound of the width loop
+        .max_h = 16,            // inclusive upper bound of the height loop
+        .min_unalign_byte = 0,
+        .max_unalign_byte = 16, // inclusive upper bound of the unalignment loop
+        .unalign_step = 1,
+        .dest_stride_step = 1,
+        .test_combinations_count = 0,   // incremented by functionality_test_matrix() for every combination run
+    };
+
+    func_test_case_params_t test_case = {   // remaining fields are set per combination by the test matrix
+        .blend_api_func = &lv_draw_sw_blend_color_to_argb8888,
+        .color_format = LV_COLOR_FORMAT_ARGB8888,
+        .data_type_size = sizeof(uint32_t),
+    };
+
+    ESP_LOGI(TAG_LV_FILL_FUNC, "running test for ARGB8888 color format");
+    functionality_test_matrix(&test_matrix, &test_case);
+}
+
+TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
+{
+    test_matrix_params_t test_matrix = {
+        .min_w = 8,             // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
+        .min_h = 1,
+        .max_w = 16,            // inclusive upper bound of the width loop
+        .max_h = 16,            // inclusive upper bound of the height loop
+        .min_unalign_byte = 0,
+        .max_unalign_byte = 16, // inclusive upper bound of the unalignment loop
+        .unalign_step = 1,
+        .dest_stride_step = 1,
+        .test_combinations_count = 0,   // incremented by functionality_test_matrix() for every combination run
+    };
+
+    func_test_case_params_t test_case = {   // remaining fields are set per combination by the test matrix
+        .blend_api_func = &lv_draw_sw_blend_color_to_rgb565,
+        .color_format = LV_COLOR_FORMAT_RGB565,
+        .data_type_size = sizeof(uint16_t),
+    };
+
+    ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB565 color format");
+    functionality_test_matrix(&test_matrix, &test_case);
+}
+
+// ------------------------------------------------ Static test functions ----------------------------------------------
+
+static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case)
+{
+    // Step destination array width (loop counters are unsigned, like the limits they are compared against)
+    for (unsigned int dest_w = test_matrix->min_w; dest_w <= test_matrix->max_w; dest_w++) {
+
+        // Step destination array height
+        for (unsigned int dest_h = test_matrix->min_h; dest_h <= test_matrix->max_h; dest_h++) {
+
+            // Step destination array stride, from dest_w up to and including 2 * dest_w
+            for (unsigned int dest_stride = dest_w; dest_stride <= dest_w * 2; dest_stride += test_matrix->dest_stride_step) {
+
+                // Step destination array unalignment
+                for (unsigned int unalign_byte = test_matrix->min_unalign_byte; unalign_byte <= test_matrix->max_unalign_byte; unalign_byte += test_matrix->unalign_step) {
+
+                    // Call functionality test
+                    UPDATE_TEST_CASE(test_case, dest_w, dest_h, dest_stride, unalign_byte);
+                    lv_fill_functionality(test_case);
+                    test_matrix->test_combinations_count++;
+                }
+            }
+        }
+    }
+    ESP_LOGI(TAG_LV_FILL_FUNC, "test combinations: %u\n", test_matrix->test_combinations_count);
+}
+
+static void lv_fill_functionality(func_test_case_params_t *test_case)
+{
+    fill_test_bufs(test_case);
+
+    // Init structure for LVGL blend API, to call the Assembly API
+    _lv_draw_sw_blend_fill_dsc_t dsc_asm = {
+        .dest_buf = test_case->buf.p_asm,
+        .dest_w = test_case->dest_w,
+        .dest_h = test_case->dest_h,
+        .dest_stride = test_case->dest_stride * test_case->data_type_size,  // stride * sizeof()
+        .mask_buf = NULL,
+        .color = test_color,
+        .opa = LV_OPA_MAX,
+        .use_asm = true,
+    };
+
+    // Init structure for LVGL blend API, to call the ANSI API
+    _lv_draw_sw_blend_fill_dsc_t dsc_ansi = dsc_asm;
+    dsc_ansi.dest_buf = test_case->buf.p_ansi;
+    dsc_ansi.use_asm = false;
+
+    test_case->blend_api_func(&dsc_asm);    // Call the LVGL API with Assembly code
+    test_case->blend_api_func(&dsc_ansi);   // Call the LVGL API with ANSI code
+
+    // Shift array pointers by Canary Bytes amount back (byte-pointer arithmetic; void * arithmetic is a compiler extension)
+    test_case->buf.p_asm = (uint8_t *)test_case->buf.p_asm - CANARY_BYTES * test_case->data_type_size;
+    test_case->buf.p_ansi = (uint8_t *)test_case->buf.p_ansi - CANARY_BYTES * test_case->data_type_size;
+
+    // Describe the test case for the evaluation; the write is bounded by the size of test_msg_buf
+    snprintf(test_msg_buf, sizeof(test_msg_buf), "Test case: dest_w = %u, dest_h = %u, dest_stride = %u, unalign_byte = %u\n", test_case->dest_w, test_case->dest_h, test_case->dest_stride, test_case->unalign_byte);
+
+    switch (test_case->color_format) {
+    case LV_COLOR_FORMAT_ARGB8888: {
+        test_eval_32bit_data(test_case);
+        break;
+    }
+
+    case LV_COLOR_FORMAT_RGB565: {
+        test_eval_16bit_data(test_case);
+        break;
+    }
+
+    default:
+        TEST_ASSERT_MESSAGE(false, "LV Color format not found");
+    }
+
+    free(test_case->buf.p_asm_alloc);
+    free(test_case->buf.p_ansi_alloc);
+
+}
+
+// Allocate and pre-fill the Assembly and the ANSI destination buffer (canary areas included) and store them into test_case->buf
+static void fill_test_bufs(func_test_case_params_t *test_case)
+{
+    const size_t data_type_size = test_case->data_type_size;        // sizeof() of used data type
+    const size_t total_buf_len = test_case->total_buf_len;          // Total buffer length, data part of the buffer including the Canary bytes
+    const size_t active_buf_len = test_case->active_buf_len;        // Length of buffer
+    const unsigned int unalign_byte = test_case->unalign_byte;
+    const size_t buf_size = total_buf_len * data_type_size;         // Buffer size in bytes, unalignment not included
+
+    // Allocate destination arrays for Assembly and ANSI LVGL Blend API
+    void *mem_asm   = memalign(16, buf_size + unalign_byte);
+    void *mem_ansi  = memalign(16, buf_size + unalign_byte);
+    if ((mem_asm == NULL) || (mem_ansi == NULL)) {
+        free(mem_asm);      // Do not leak the buffer which was allocated successfully, free(NULL) is a no-op
+        free(mem_ansi);
+        TEST_ASSERT_MESSAGE(false, "Lack of memory");
+    }
+
+    // Save a pointer to the beginning of the allocated memory which will be used to free()
+    test_case->buf.p_asm_alloc = mem_asm;
+    test_case->buf.p_ansi_alloc = mem_ansi;
+
+    // Apply destination array unalignment
+    uint8_t *dest_buf_asm = (uint8_t *)mem_asm + unalign_byte;
+    uint8_t *dest_buf_ansi = (uint8_t *)mem_ansi + unalign_byte;
+
+    // Set the whole buffer to 0, including the Canary bytes part
+    memset(dest_buf_asm, 0, buf_size);
+    memset(dest_buf_ansi, 0, buf_size);
+
+    // Fill the first byte of each element of the actual part with known values, same for both buffers because of the stride
+    for (size_t i = CANARY_BYTES; i < active_buf_len + CANARY_BYTES; i++) {
+        dest_buf_asm[i * data_type_size] = (uint8_t)(i % 255);
+        dest_buf_ansi[i * data_type_size] = (uint8_t)(i % 255);
+    }
+
+    // Save a pointer to the working part of the memory (shifted by Canary Bytes amount), where the test data are stored
+    test_case->buf.p_asm = (void *)(dest_buf_asm + CANARY_BYTES * data_type_size);
+    test_case->buf.p_ansi = (void *)(dest_buf_ansi + CANARY_BYTES * data_type_size);
+}
+
+// Evaluate 32bit results: both canary areas must stay 0 and the Assembly output must be equal to the ANSI output
+static void test_eval_32bit_data(func_test_case_params_t *test_case)
+{
+#if DBG_PRINT_OUTPUT    // Print results, 32bit data
+    for (uint32_t i = 0; i < test_case->total_buf_len; i++) {
+        printf("dest_buf[%"PRIu32"] %s ansi = %8"PRIx32" \t asm = %8"PRIx32" \n", i, ((i < 10) ? (" ") : ("")), ((uint32_t *)test_case->buf.p_ansi)[i], ((uint32_t *)test_case->buf.p_asm)[i]);
+    }
+    printf("\n");
+#endif
+
+    // Leading canary bytes area must stay 0
+    TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_ansi, CANARY_BYTES, test_msg_buf);
+    TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_asm, CANARY_BYTES, test_msg_buf);
+
+    // dest_buf_asm and dest_buf_ansi must be equal, the ANSI result is passed as the expected array
+    TEST_ASSERT_EQUAL_UINT32_ARRAY_MESSAGE((uint32_t *)test_case->buf.p_ansi + CANARY_BYTES, (uint32_t *)test_case->buf.p_asm + CANARY_BYTES, test_case->active_buf_len, test_msg_buf);
+
+    // Trailing canary bytes area must stay 0
+    TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_ansi + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf);
+    TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_asm + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf);
+}
+
+// Evaluate 16bit results: both canary areas must stay 0 and the Assembly output must be equal to the ANSI output
+static void test_eval_16bit_data(func_test_case_params_t *test_case)
+{
+#if DBG_PRINT_OUTPUT    // Print results, 16bit data
+    for (uint32_t i = 0; i < test_case->total_buf_len; i++) {
+        printf("dest_buf[%"PRIu32"] %s ansi = %8"PRIx16" \t asm = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), ((uint16_t *)test_case->buf.p_ansi)[i], ((uint16_t *)test_case->buf.p_asm)[i]);
+    }
+    printf("\n");
+#endif
+
+    // Leading canary bytes area must stay 0
+    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi, CANARY_BYTES, test_msg_buf);
+    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm, CANARY_BYTES, test_msg_buf);
+
+    // dest_buf_asm and dest_buf_ansi must be equal, the ANSI result is passed as the expected array
+    TEST_ASSERT_EQUAL_UINT16_ARRAY_MESSAGE((uint16_t *)test_case->buf.p_ansi + CANARY_BYTES, (uint16_t *)test_case->buf.p_asm + CANARY_BYTES, test_case->active_buf_len, test_msg_buf);
+
+    // Trailing canary bytes area must stay 0
+    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf);
+    TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf);
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/sdkconfig.defaults b/components/esp_lvgl_port/test_apps/simd/sdkconfig.defaults
new file mode 100644
index 00000000..384a6053
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/sdkconfig.defaults
@@ -0,0 +1,3 @@
+CONFIG_ESP_TASK_WDT=n
+CONFIG_OPTIMIZATION_LEVEL_RELEASE=y
+CONFIG_COMPILER_OPTIMIZATION_PERF=y
\ No newline at end of file