Merge branch 'feat/async_memcpy_any_alignment_v5.3' into 'release/v5.3'

async memcpy destination address doesn't have to be cache aligned (v5.3) See merge request espressif/esp-idf!36634
2025-03-12 18:49:08 -04:00 · 2025-02-10 13:32:22 +08:00 · 2025-02-10 13:32:22 +08:00 · 33cc36595d
commit 33cc36595d
parent 57061d6336 74615ed1a7
7 changed files with 717 additions and 462 deletions
--- a/components/esp_hw_support/dma/async_memcpy_gdma.c
+++ b/components/esp_hw_support/dma/async_memcpy_gdma.c
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: 2020-2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2020-2025 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
@ -16,69 +16,49 @@
 #include "esp_attr.h"
 #include "esp_err.h"
 #include "esp_private/gdma.h"
+#include "esp_private/gdma_link.h"
+#include "esp_private/esp_dma_utils.h"
 #include "esp_memory_utils.h"
+#include "esp_cache.h"
 #include "esp_async_memcpy.h"
 #include "esp_async_memcpy_priv.h"
-#include "esp_cache.h"
-#include "hal/dma_types.h"
 #include "hal/cache_hal.h"
 #include "hal/cache_ll.h"
+#include "hal/gdma_ll.h"

 static const char *TAG = "async_mcp.gdma";

-#ifdef CACHE_LL_L2MEM_NON_CACHE_ADDR
-#define MCP_GET_NON_CACHE_ADDR(addr) ((addr) ? CACHE_LL_L2MEM_NON_CACHE_ADDR(addr) : 0)
-#else
-#define MCP_GET_NON_CACHE_ADDR(addr) (addr)
-#endif
-
-#if SOC_AXI_GDMA_SUPPORTED
-#define MCP_DMA_DESC_ALIGN 8
-typedef dma_descriptor_align8_t mcp_dma_descriptor_t;
-#elif SOC_AHB_GDMA_SUPPORTED
-#define MCP_DMA_DESC_ALIGN 4
-typedef dma_descriptor_align4_t mcp_dma_descriptor_t;
-#else
-#error "Unsupported GDMA type"
-#endif
+#define MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE 4095

 /// @brief Transaction object for async memcpy
-/// @note - GDMA requires the DMA descriptors to be 4 or 8 bytes aligned
-/// @note - The DMA descriptor link list is allocated dynamically from DMA-able memory
-/// @note - Because of the eof_node, the transaction object should also be allocated from DMA-able memory
 typedef struct async_memcpy_transaction_t {
-    mcp_dma_descriptor_t eof_node;      // this is the DMA node which act as the EOF descriptor (RX path only)
-    mcp_dma_descriptor_t *tx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size
-    mcp_dma_descriptor_t *tx_desc_nc;   // non-cacheable version of tx_desc_link
-    mcp_dma_descriptor_t *rx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size
-    mcp_dma_descriptor_t *rx_desc_nc;   // non-cacheable version of rx_desc_link
-    intptr_t tx_start_desc_addr; // TX start descriptor address
-    intptr_t rx_start_desc_addr; // RX start descriptor address
-    void *memcpy_dst_addr;       // memcpy destination address
-    size_t memcpy_size;          // memcpy size
-    async_memcpy_isr_cb_t cb;    // user callback
-    void *cb_args;               // user callback args
+    gdma_link_list_handle_t tx_link_list;  // DMA link list for TX direction
+    gdma_link_list_handle_t rx_link_list;  // DMA link list for RX direction
+    dma_buffer_split_array_t rx_buf_array; // Split the destination buffer into cache aligned ones, save the splits in this array
+    uint8_t* stash_buffer;                 // Stash buffer for cache aligned buffer
+    async_memcpy_isr_cb_t cb; // user callback
+    void *cb_args;            // user callback args
    STAILQ_ENTRY(async_memcpy_transaction_t) idle_queue_entry;  // Entry for the idle queue
    STAILQ_ENTRY(async_memcpy_transaction_t) ready_queue_entry; // Entry for the ready queue
 } async_memcpy_transaction_t;

 /// @brief Context of async memcpy driver
 /// @note - It saves two queues, one for idle transaction objects, one for ready transaction objects
-/// @note - Transaction objects are allocated from DMA-able memory
 /// @note - Number of transaction objects are determined by the backlog parameter
 typedef struct {
    async_memcpy_context_t parent; // Parent IO interface
-    size_t rx_int_mem_alignment;   // DMA buffer alignment (both in size and address) for internal RX memory
-    size_t rx_ext_mem_alignment;   // DMA buffer alignment (both in size and address) for external RX memory
-    size_t tx_int_mem_alignment;   // DMA buffer alignment (both in size and address) for internal TX memory
-    size_t tx_ext_mem_alignment;   // DMA buffer alignment (both in size and address) for external TX memory
-    size_t max_single_dma_buffer;  // max DMA buffer size by a single descriptor
+    size_t rx_int_mem_alignment;   // Required DMA buffer alignment for internal RX memory
+    size_t rx_ext_mem_alignment;   // Required DMA buffer alignment for external RX memory
+    size_t tx_int_mem_alignment;   // Required DMA buffer alignment for internal TX memory
+    size_t tx_ext_mem_alignment;   // Required DMA buffer alignment for external TX memory
    int gdma_bus_id;               // GDMA bus id (AHB, AXI, etc.)
    gdma_channel_handle_t tx_channel; // GDMA TX channel handle
    gdma_channel_handle_t rx_channel; // GDMA RX channel handle
    portMUX_TYPE spin_lock;           // spin lock to avoid threads and isr from accessing the same resource simultaneously
    _Atomic async_memcpy_fsm_t fsm;   // driver state machine, changing state should be atomic
-    async_memcpy_transaction_t *transaction_pool; // transaction object pool
+    size_t num_trans_objs;            // number of transaction objects
+    async_memcpy_transaction_t *transaction_pool;    // transaction object pool
+    async_memcpy_transaction_t *current_transaction; // current transaction object
    STAILQ_HEAD(, async_memcpy_transaction_t) idle_queue_head;  // Head of the idle queue
    STAILQ_HEAD(, async_memcpy_transaction_t) ready_queue_head; // Head of the ready queue
 } async_memcpy_gdma_context_t;
@ -92,9 +72,23 @@ static esp_err_t mcp_new_etm_event(async_memcpy_context_t *ctx, async_memcpy_etm

 static esp_err_t mcp_gdma_destroy(async_memcpy_gdma_context_t *mcp_gdma)
 {
+    // clean up transaction pool
    if (mcp_gdma->transaction_pool) {
+        for (size_t i = 0; i < mcp_gdma->num_trans_objs; i++) {
+            async_memcpy_transaction_t* trans = &mcp_gdma->transaction_pool[i];
+            if (trans->tx_link_list) {
+                gdma_del_link_list(trans->tx_link_list);
+            }
+            if (trans->rx_link_list) {
+                gdma_del_link_list(trans->rx_link_list);
+            }
+            if (trans->stash_buffer) {
+                free(trans->stash_buffer);
+            }
+        }
        free(mcp_gdma->transaction_pool);
    }
+    // clean up GDMA channels
    if (mcp_gdma->tx_channel) {
        gdma_disconnect(mcp_gdma->tx_channel);
        gdma_del_channel(mcp_gdma->tx_channel);
@ -108,19 +102,19 @@ static esp_err_t mcp_gdma_destroy(async_memcpy_gdma_context_t *mcp_gdma)
 }

 static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp,
-                                                        esp_err_t (*new_channel)(const gdma_channel_alloc_config_t *, gdma_channel_handle_t *),
+                                                        esp_err_t (*new_channel_func)(const gdma_channel_alloc_config_t *, gdma_channel_handle_t *),
                                                        int gdma_bus_id)
 {
    esp_err_t ret = ESP_OK;
    async_memcpy_gdma_context_t *mcp_gdma = NULL;
    ESP_RETURN_ON_FALSE(config && mcp, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
-    // allocate memory of driver context from internal memory
+
+    // allocate memory of driver context from internal memory (because it contains atomic variable)
    mcp_gdma = heap_caps_calloc(1, sizeof(async_memcpy_gdma_context_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
    ESP_GOTO_ON_FALSE(mcp_gdma, ESP_ERR_NO_MEM, err, TAG, "no mem for driver context");
    uint32_t trans_queue_len = config->backlog ? config->backlog : DEFAULT_TRANSACTION_QUEUE_LENGTH;
-    // allocate memory for transaction pool from internal memory because transaction structure contains DMA descriptor
-    mcp_gdma->transaction_pool = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, trans_queue_len, sizeof(async_memcpy_transaction_t),
-                                                          MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
+    // allocate memory for transaction pool from internal memory
+    mcp_gdma->transaction_pool = heap_caps_calloc(trans_queue_len, sizeof(async_memcpy_transaction_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
    ESP_GOTO_ON_FALSE(mcp_gdma->transaction_pool, ESP_ERR_NO_MEM, err, TAG, "no mem for transaction pool");

    // create TX channel and RX channel, they should reside in the same DMA pair
@ -128,29 +122,39 @@ static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_confi
        .flags.reserve_sibling = 1,
        .direction = GDMA_CHANNEL_DIRECTION_TX,
    };
-    ESP_GOTO_ON_ERROR(new_channel(&tx_alloc_config, &mcp_gdma->tx_channel), err, TAG, "failed to create GDMA TX channel");
+    ESP_GOTO_ON_ERROR(new_channel_func(&tx_alloc_config, &mcp_gdma->tx_channel), err, TAG, "failed to alloc GDMA TX channel");
    gdma_channel_alloc_config_t rx_alloc_config = {
        .direction = GDMA_CHANNEL_DIRECTION_RX,
        .sibling_chan = mcp_gdma->tx_channel,
    };
-    ESP_GOTO_ON_ERROR(new_channel(&rx_alloc_config, &mcp_gdma->rx_channel), err, TAG, "failed to create GDMA RX channel");
+    ESP_GOTO_ON_ERROR(new_channel_func(&rx_alloc_config, &mcp_gdma->rx_channel), err, TAG, "failed to alloc GDMA RX channel");

-    // initialize GDMA channels
-    gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0);
    // get a free DMA trigger ID for memory copy
+    gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0);
    uint32_t free_m2m_id_mask = 0;
    gdma_get_free_m2m_trig_id_mask(mcp_gdma->tx_channel, &free_m2m_id_mask);
    m2m_trigger.instance_id = __builtin_ctz(free_m2m_id_mask);
    ESP_GOTO_ON_ERROR(gdma_connect(mcp_gdma->rx_channel, m2m_trigger), err, TAG, "GDMA rx connect failed");
    ESP_GOTO_ON_ERROR(gdma_connect(mcp_gdma->tx_channel, m2m_trigger), err, TAG, "GDMA tx connect failed");

+    gdma_strategy_config_t strategy_cfg = {
+        .owner_check = true,
+        .auto_update_desc = true,
+    };
+    gdma_apply_strategy(mcp_gdma->tx_channel, &strategy_cfg);
+    gdma_apply_strategy(mcp_gdma->rx_channel, &strategy_cfg);
+
    gdma_transfer_config_t transfer_cfg = {
-        .max_data_burst_size = config->dma_burst_size ? config->dma_burst_size : 16,
+        .max_data_burst_size = config->dma_burst_size,
        .access_ext_mem = true, // allow to do memory copy from/to external memory
    };
    ESP_GOTO_ON_ERROR(gdma_config_transfer(mcp_gdma->tx_channel, &transfer_cfg), err, TAG, "config transfer for tx channel failed");
    ESP_GOTO_ON_ERROR(gdma_config_transfer(mcp_gdma->rx_channel, &transfer_cfg), err, TAG, "config transfer for rx channel failed");

+    // get the buffer alignment required by the GDMA channel
+    gdma_get_alignment_constraints(mcp_gdma->rx_channel, &mcp_gdma->rx_int_mem_alignment, &mcp_gdma->rx_ext_mem_alignment);
+    gdma_get_alignment_constraints(mcp_gdma->tx_channel, &mcp_gdma->tx_int_mem_alignment, &mcp_gdma->tx_ext_mem_alignment);
+
    // register rx eof callback
    gdma_rx_event_callbacks_t cbs = {
        .on_recv_eof = mcp_gdma_rx_eof_callback,
@ -169,20 +173,14 @@ static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_confi
    portMUX_INITIALIZE(&mcp_gdma->spin_lock);
    atomic_init(&mcp_gdma->fsm, MCP_FSM_IDLE);
    mcp_gdma->gdma_bus_id = gdma_bus_id;
+    mcp_gdma->num_trans_objs = trans_queue_len;

-    // get the buffer alignment required by the GDMA channel
-    gdma_get_alignment_constraints(mcp_gdma->rx_channel, &mcp_gdma->rx_int_mem_alignment, &mcp_gdma->rx_ext_mem_alignment);
-    gdma_get_alignment_constraints(mcp_gdma->tx_channel, &mcp_gdma->tx_int_mem_alignment, &mcp_gdma->tx_ext_mem_alignment);
-
-    size_t buf_align = MAX(MAX(mcp_gdma->rx_int_mem_alignment, mcp_gdma->rx_ext_mem_alignment),
-                           MAX(mcp_gdma->tx_int_mem_alignment, mcp_gdma->tx_ext_mem_alignment));
-    mcp_gdma->max_single_dma_buffer = ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, buf_align);
    mcp_gdma->parent.del = mcp_gdma_del;
    mcp_gdma->parent.memcpy = mcp_gdma_memcpy;
 #if SOC_GDMA_SUPPORT_ETM
    mcp_gdma->parent.new_etm_event = mcp_new_etm_event;
 #endif
-    // return driver object
+    // return base object
    *mcp = &mcp_gdma->parent;
    return ESP_OK;

@ -227,61 +225,6 @@ static esp_err_t mcp_gdma_del(async_memcpy_context_t *ctx)
    return mcp_gdma_destroy(mcp_gdma);
 }

-static void mount_tx_buffer_to_dma(async_memcpy_transaction_t *trans, int num_desc,
-                                   uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer)
-{
-    mcp_dma_descriptor_t *desc_array = trans->tx_desc_link;
-    mcp_dma_descriptor_t *desc_nc = trans->tx_desc_nc;
-    uint32_t prepared_length = 0;
-    size_t len = buf_sz;
-    for (int i = 0; i < num_desc - 1; i++) {
-        desc_nc[i].buffer = &buf[prepared_length];
-        desc_nc[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
-        desc_nc[i].dw0.suc_eof = 0;
-        desc_nc[i].dw0.size = max_single_dma_buffer;
-        desc_nc[i].dw0.length = max_single_dma_buffer;
-        desc_nc[i].next = &desc_array[i + 1];
-        prepared_length += max_single_dma_buffer;
-        len -= max_single_dma_buffer;
-    }
-    // take special care to the EOF descriptor
-    desc_nc[num_desc - 1].buffer = &buf[prepared_length];
-    desc_nc[num_desc - 1].next = NULL;
-    desc_nc[num_desc - 1].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
-    desc_nc[num_desc - 1].dw0.suc_eof = 1;
-    desc_nc[num_desc - 1].dw0.size = len;
-    desc_nc[num_desc - 1].dw0.length = len;
-}
-
-static void mount_rx_buffer_to_dma(async_memcpy_transaction_t *trans, int num_desc,
-                                   uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer)
-{
-    mcp_dma_descriptor_t *desc_array = trans->rx_desc_link;
-    mcp_dma_descriptor_t *desc_nc = trans->rx_desc_nc;
-    mcp_dma_descriptor_t *eof_desc = &trans->eof_node;
-    mcp_dma_descriptor_t *eof_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(eof_desc);
-    uint32_t prepared_length = 0;
-    size_t len = buf_sz;
-    if (desc_array) {
-        assert(num_desc > 0);
-        for (int i = 0; i < num_desc; i++) {
-            desc_nc[i].buffer = &buf[prepared_length];
-            desc_nc[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
-            desc_nc[i].dw0.size = max_single_dma_buffer;
-            desc_nc[i].dw0.length = max_single_dma_buffer;
-            desc_nc[i].next = &desc_array[i + 1];
-            prepared_length += max_single_dma_buffer;
-            len -= max_single_dma_buffer;
-        }
-        desc_nc[num_desc - 1].next = eof_desc;
-    }
-    eof_nc->buffer = &buf[prepared_length];
-    eof_nc->next = NULL;
-    eof_nc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
-    eof_nc->dw0.size = len;
-    eof_nc->dw0.length = len;
-}
-
 /// @brief help function to get one transaction from the ready queue
 /// @note this function is allowed to be called in ISR
 static async_memcpy_transaction_t *try_pop_trans_from_ready_queue(async_memcpy_gdma_context_t *mcp_gdma)
@ -306,8 +249,9 @@ static void try_start_pending_transaction(async_memcpy_gdma_context_t *mcp_gdma)
        trans = try_pop_trans_from_ready_queue(mcp_gdma);
        if (trans) {
            atomic_store(&mcp_gdma->fsm, MCP_FSM_RUN);
-            gdma_start(mcp_gdma->rx_channel, trans->rx_start_desc_addr);
-            gdma_start(mcp_gdma->tx_channel, trans->tx_start_desc_addr);
+            mcp_gdma->current_transaction = trans;
+            gdma_start(mcp_gdma->rx_channel, gdma_link_get_head_addr(trans->rx_link_list));
+            gdma_start(mcp_gdma->tx_channel, gdma_link_get_head_addr(trans->tx_link_list));
        } else {
            atomic_store(&mcp_gdma->fsm, MCP_FSM_IDLE);
        }
@ -328,6 +272,7 @@ static async_memcpy_transaction_t *try_pop_trans_from_idle_queue(async_memcpy_gd
    return trans;
 }

+/// @brief Check if the address and size can meet the requirement of the DMA engine
 static bool check_buffer_alignment(async_memcpy_gdma_context_t *mcp_gdma, void *src, void *dst, size_t n)
 {
    bool valid = true;
@ -355,19 +300,26 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s
 {
    esp_err_t ret = ESP_OK;
    async_memcpy_gdma_context_t *mcp_gdma = __containerof(ctx, async_memcpy_gdma_context_t, parent);
+    size_t dma_link_item_alignment = 4;
    // buffer location check
-#if SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM
+#if SOC_AHB_GDMA_SUPPORTED
    if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AHB) {
+#if !SOC_AHB_GDMA_SUPPORT_PSRAM
        ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AHB GDMA can only access SRAM");
+#endif // !SOC_AHB_GDMA_SUPPORT_PSRAM
+        dma_link_item_alignment = GDMA_LL_AHB_DESC_ALIGNMENT;
    }
-#endif // SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM
-#if SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM
+#endif // SOC_AHB_GDMA_SUPPORTED
+#if SOC_AXI_GDMA_SUPPORTED
    if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AXI) {
-        ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AXI DMA can only access SRAM");
+#if !SOC_AXI_GDMA_SUPPORT_PSRAM
+        ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AXI GDMA can only access SRAM");
+#endif // !SOC_AXI_GDMA_SUPPORT_PSRAM
+        dma_link_item_alignment = GDMA_LL_AXI_DESC_ALIGNMENT;
    }
-#endif // SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM
+#endif // SOC_AXI_GDMA_SUPPORTED
    // alignment check
-    ESP_RETURN_ON_FALSE(check_buffer_alignment(mcp_gdma, src, dst, n), ESP_ERR_INVALID_ARG, TAG, "buffer not aligned: %p -> %p, sz=%zu", src, dst, n);
+    ESP_RETURN_ON_FALSE(check_buffer_alignment(mcp_gdma, src, dst, n), ESP_ERR_INVALID_ARG, TAG, "address|size not aligned: %p -> %p, sz=%zu", src, dst, n);

    async_memcpy_transaction_t *trans = NULL;
    // pick one transaction node from idle queue
@ -375,51 +327,84 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s
    // check if we get the transaction object successfully
    ESP_RETURN_ON_FALSE(trans, ESP_ERR_INVALID_STATE, TAG, "no free node in the idle queue");

-    // calculate how many descriptors we want
-    size_t max_single_dma_buffer = mcp_gdma->max_single_dma_buffer;
-    uint32_t num_desc_per_path = (n + max_single_dma_buffer - 1) / max_single_dma_buffer;
-    // allocate DMA descriptors from internal memory
-    trans->tx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path, sizeof(mcp_dma_descriptor_t),
-                                                   MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
-    ESP_GOTO_ON_FALSE(trans->tx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
-    trans->tx_desc_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(trans->tx_desc_link);
-    // don't have to allocate the EOF descriptor, we will use trans->eof_node as the RX EOF descriptor
-    if (num_desc_per_path > 1) {
-        trans->rx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path - 1, sizeof(mcp_dma_descriptor_t),
-                                                       MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
-        ESP_GOTO_ON_FALSE(trans->rx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
-        trans->rx_desc_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(trans->rx_desc_link);
-    } else {
-        // small copy buffer, use the trans->eof_node is sufficient
-        trans->rx_desc_link = NULL;
-        trans->rx_desc_nc = NULL;
+    // clean up the transaction configuration comes from the last one
+    if (trans->tx_link_list) {
+        gdma_del_link_list(trans->tx_link_list);
+        trans->tx_link_list = NULL;
+    }
+    if (trans->rx_link_list) {
+        gdma_del_link_list(trans->rx_link_list);
+        trans->rx_link_list = NULL;
+    }
+    if (trans->stash_buffer) {
+        free(trans->stash_buffer);
+        trans->stash_buffer = NULL;
    }

-    // (preload) mount src data to the TX descriptor
-    mount_tx_buffer_to_dma(trans, num_desc_per_path, src, n, max_single_dma_buffer);
-    // (preload) mount dst data to the RX descriptor
-    mount_rx_buffer_to_dma(trans, num_desc_per_path - 1, dst, n, max_single_dma_buffer);
+    // allocate gdma TX link
+    gdma_link_list_config_t tx_link_cfg = {
+        .buffer_alignment = esp_ptr_internal(src) ? mcp_gdma->tx_int_mem_alignment : mcp_gdma->tx_ext_mem_alignment,
+        .item_alignment = dma_link_item_alignment,
+        .num_items = n / MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1,
+        .flags = {
+            .check_owner = true,
+            .items_in_ext_mem = false, // TODO: if the memcopy size is too large, we may need to allocate the link list items from external memory
+        },
+    };
+    ESP_GOTO_ON_ERROR(gdma_new_link_list(&tx_link_cfg, &trans->tx_link_list), err, TAG, "failed to create TX link list");
+    // mount the source buffer to the TX link list
+    gdma_buffer_mount_config_t tx_buf_mount_config[1] = {
+        [0] = {
+            .buffer = src,
+            .length = n,
+            .flags = {
+                .mark_eof = true,   // mark the last item as EOF, so the RX channel can also received an EOF list item
+                .mark_final = true, // using singly list, so terminate the link here
+            }
+        }
+    };
+    gdma_link_mount_buffers(trans->tx_link_list, 0, tx_buf_mount_config, 1, NULL);

-    // if the data is in the cache, write back, then DMA can see the latest data
+    // read the cache line size of internal and external memory, we use this information to check if a given memory is behind the cache
+    // write back the source data if it's behind the cache
+    size_t int_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA);
+    size_t ext_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_EXT_MEM, CACHE_TYPE_DATA);
    bool need_write_back = false;
    if (esp_ptr_external_ram(src)) {
-        need_write_back = true;
+        need_write_back = ext_mem_cache_line_size > 0;
    } else if (esp_ptr_internal(src)) {
-#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE
-        need_write_back = true;
-#endif
+        need_write_back = int_mem_cache_line_size > 0;
    }
    if (need_write_back) {
-        esp_cache_msync(src, n, ESP_CACHE_MSYNC_FLAG_DIR_C2M);
+        esp_cache_msync(src, n, ESP_CACHE_MSYNC_FLAG_DIR_C2M | ESP_CACHE_MSYNC_FLAG_UNALIGNED);
    }

+    // allocate gdma RX link
+    gdma_link_list_config_t rx_link_cfg = {
+        .buffer_alignment = esp_ptr_internal(dst) ? mcp_gdma->rx_int_mem_alignment : mcp_gdma->rx_ext_mem_alignment,
+        .item_alignment = dma_link_item_alignment,
+        .num_items = n / MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 3,
+        .flags = {
+            .check_owner = true,
+            .items_in_ext_mem = false, // TODO: if the memcopy size is too large, we may need to allocate the link list items from external memory
+        },
+    };
+    ESP_GOTO_ON_ERROR(gdma_new_link_list(&rx_link_cfg, &trans->rx_link_list), err, TAG, "failed to create RX link list");
+
+    // if the destination buffer address is not cache line aligned, we need to split the buffer into cache line aligned ones
+    ESP_GOTO_ON_ERROR(esp_dma_split_rx_buffer_to_cache_aligned(dst, n, &trans->rx_buf_array, &trans->stash_buffer),
+                      err, TAG, "failed to split RX buffer into aligned ones");
+    // mount the destination buffer to the RX link list
+    gdma_buffer_mount_config_t rx_buf_mount_config[3] = {0};
+    for (int i = 0; i < 3; i++) {
+        rx_buf_mount_config[i].buffer = trans->rx_buf_array.aligned_buffer[i].aligned_buffer;
+        rx_buf_mount_config[i].length = trans->rx_buf_array.aligned_buffer[i].length;
+    }
+    gdma_link_mount_buffers(trans->rx_link_list, 0, rx_buf_mount_config, 3, NULL);
+
    // save other transaction context
    trans->cb = cb_isr;
    trans->cb_args = cb_args;
-    trans->memcpy_size = n;
-    trans->memcpy_dst_addr = dst; // save the destination buffer address, because we may need to do data cache invalidate later
-    trans->tx_start_desc_addr = (intptr_t)trans->tx_desc_link;
-    trans->rx_start_desc_addr = trans->rx_desc_link ? (intptr_t)trans->rx_desc_link : (intptr_t)&trans->eof_node;

    portENTER_CRITICAL(&mcp_gdma->spin_lock);
    // insert the trans to ready queue
@ -433,14 +418,6 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s

 err:
    if (trans) {
-        if (trans->tx_desc_link) {
-            free(trans->tx_desc_link);
-            trans->tx_desc_link = NULL;
-        }
-        if (trans->rx_desc_link) {
-            free(trans->rx_desc_link);
-            trans->rx_desc_link = NULL;
-        }
        // return back the trans to idle queue
        portENTER_CRITICAL(&mcp_gdma->spin_lock);
        STAILQ_INSERT_TAIL(&mcp_gdma->idle_queue_head, trans, idle_queue_entry);
@ -453,26 +430,14 @@ static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_
 {
    bool need_yield = false;
    async_memcpy_gdma_context_t *mcp_gdma = (async_memcpy_gdma_context_t *)user_data;
-    mcp_dma_descriptor_t *eof_desc = (mcp_dma_descriptor_t *)event_data->rx_eof_desc_addr;
-    // get the transaction object address by the EOF descriptor address
-    async_memcpy_transaction_t *trans = __containerof(eof_desc, async_memcpy_transaction_t, eof_node);
+    async_memcpy_transaction_t *trans = mcp_gdma->current_transaction;
+    dma_buffer_split_array_t *rx_buf_array = &trans->rx_buf_array;

    // switch driver state from RUN to IDLE
    async_memcpy_fsm_t expected_fsm = MCP_FSM_RUN;
    if (atomic_compare_exchange_strong(&mcp_gdma->fsm, &expected_fsm, MCP_FSM_IDLE_WAIT)) {
-        void *dst = trans->memcpy_dst_addr;
-        // if the data is in the cache, invalidate, then CPU can see the latest data
-        bool need_invalidate = false;
-        if (esp_ptr_external_ram(dst)) {
-            need_invalidate = true;
-        } else if (esp_ptr_internal(dst)) {
-#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE
-            need_invalidate = true;
-#endif
-        }
-        if (need_invalidate) {
-            esp_cache_msync(dst, trans->memcpy_size, ESP_CACHE_MSYNC_FLAG_DIR_M2C);
-        }
+        // merge the cache aligned buffers to the original buffer
+        esp_dma_merge_aligned_rx_buffers(rx_buf_array);

        // invoked callback registered by user
        async_memcpy_isr_cb_t cb = trans->cb;
@ -482,15 +447,6 @@ static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_
            };
            need_yield = cb(&mcp_gdma->parent, &e, trans->cb_args);
        }
-        // recycle descriptor memory
-        if (trans->tx_desc_link) {
-            free(trans->tx_desc_link);
-            trans->tx_desc_link = NULL;
-        }
-        if (trans->rx_desc_link) {
-            free(trans->rx_desc_link);
-            trans->rx_desc_link = NULL;
-        }
        trans->cb = NULL;

        portENTER_CRITICAL_ISR(&mcp_gdma->spin_lock);
--- a/components/esp_hw_support/dma/esp_async_memcpy_priv.h
+++ b/components/esp_hw_support/dma/esp_async_memcpy_priv.h
@ -13,8 +13,6 @@
 #include "esp_async_memcpy.h"
 #include "soc/soc_caps.h"

-#define ALIGN_DOWN(val, align)  ((val) & ~((align) - 1))
-
 #define DEFAULT_TRANSACTION_QUEUE_LENGTH 4

 #ifdef __cplusplus
--- a/components/esp_hw_support/dma/esp_dma_utils.c
+++ b/components/esp_hw_support/dma/esp_dma_utils.c
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: 2023-2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2023-2025 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
@ -13,14 +13,118 @@
 #include "esp_heap_caps.h"
 #include "esp_memory_utils.h"
 #include "esp_dma_utils.h"
+#include "esp_private/esp_dma_utils.h"
 #include "esp_private/esp_cache_private.h"
 #include "soc/soc_caps.h"
 #include "hal/hal_utils.h"
+#include "hal/cache_hal.h"
+#include "hal/cache_ll.h"
+#include "esp_cache.h"

 static const char *TAG = "dma_utils";

 #define ALIGN_UP_BY(num, align) (((num) + ((align) - 1)) & ~((align) - 1))
-#define ALIGN_DOWN_BY(num, align) ((num) & (~((align) - 1)))
+
+/**
+ * @brief Split an RX buffer into cache-line-aligned pieces: head, body and tail
+ *
+ * The cache line size is chosen by where `rx_buffer` lives (external RAM or internal memory).
+ * If that size is non-zero, the unaligned head and tail are redirected into a newly allocated
+ * stash buffer (two cache lines), while the aligned middle part keeps pointing into `rx_buffer`.
+ * If it is zero, the whole buffer is reported as the body and nothing is redirected.
+ * Pieces flagged for cache sync are invalidated (M2C direction) before returning.
+ *
+ * @param[in]  rx_buffer        Buffer to split, must not be NULL
+ * @param[in]  buffer_len       Length of `rx_buffer` in bytes, must not be 0
+ * @param[out] align_buf_array  Receives the head/body/tail descriptions; a piece with zero length has NULL addresses
+ * @param[out] ret_stash_buffer Receives the stash buffer pointer, must not be NULL; this function never frees the stash buffer
+ * @return
+ *      - ESP_OK: buffer split successfully
+ *      - ESP_ERR_INVALID_ARG: a pointer argument is NULL or `buffer_len` is 0
+ *      - ESP_ERR_NO_MEM: the stash buffer could not be allocated
+ */
+esp_err_t esp_dma_split_rx_buffer_to_cache_aligned(void *rx_buffer, size_t buffer_len, dma_buffer_split_array_t *align_buf_array, uint8_t** ret_stash_buffer)
+{
+    // ret_stash_buffer is written unconditionally at the end, so reject NULL here, before anything is allocated
+    ESP_RETURN_ON_FALSE(rx_buffer && buffer_len && align_buf_array && ret_stash_buffer, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
+
+    // read the cache line size of internal and external memory, we also use this information to check if a given memory is behind the cache
+    size_t int_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA);
+    size_t ext_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_EXT_MEM, CACHE_TYPE_DATA);
+
+    size_t split_line_size = 0;
+    if (esp_ptr_external_ram(rx_buffer)) {
+        split_line_size = ext_mem_cache_line_size;
+    } else if (esp_ptr_internal(rx_buffer)) {
+        split_line_size = int_mem_cache_line_size;
+    }
+    ESP_LOGV(TAG, "split_line_size:%zu", split_line_size);
+
+    // allocate the stash buffer from internal RAM
+    // Note, the split_line_size can be 0, in this case, the stash_buffer is also NULL, which is fine
+    uint8_t* stash_buffer = heap_caps_calloc(2, split_line_size, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
+    ESP_RETURN_ON_FALSE(!(split_line_size && !stash_buffer), ESP_ERR_NO_MEM, TAG, "no mem for stash buffer");
+
+    // clear align_array to avoid garbage data
+    memset(align_buf_array, 0, sizeof(dma_buffer_split_array_t));
+    bool need_cache_sync[3] = {false};
+
+    // if split_line_size is non-zero, split the buffer into head, body and tail
+    if (split_line_size > 0) {
+        // byte-typed alias of the input, so that address arithmetic does not rely on void pointer arithmetic
+        uint8_t* input_buffer = (uint8_t*)rx_buffer;
+
+        // head_overflow_len: number of bytes from rx_buffer up to the next cache line boundary (0 if already aligned)
+        size_t head_overflow_len = (uintptr_t)rx_buffer % split_line_size;
+        head_overflow_len = head_overflow_len ? split_line_size - head_overflow_len : 0;
+        ESP_LOGV(TAG, "head_addr:%p head_overflow_len:%zu", rx_buffer, head_overflow_len);
+        // tail_overflow_len: number of bytes past the last cache line boundary inside the buffer
+        size_t tail_overflow_len = ((uintptr_t)rx_buffer + buffer_len) % split_line_size;
+        ESP_LOGV(TAG, "tail_addr:%p tail_overflow_len:%zu", (void *)(input_buffer + buffer_len - tail_overflow_len), tail_overflow_len);
+
+        uint8_t extra_buf_count = 0;
+        // head: received into the first cache line of the stash buffer, copied back to the start of the input later
+        align_buf_array->buf.head.recovery_address = input_buffer;
+        align_buf_array->buf.head.aligned_buffer = stash_buffer + split_line_size * extra_buf_count++;
+        align_buf_array->buf.head.length = head_overflow_len;
+        // the stash buffer is in internal memory, so it only needs a sync if internal memory has a cache line size
+        need_cache_sync[0] = int_mem_cache_line_size > 0;
+        // body: already aligned, received in place (aligned_buffer == recovery_address)
+        align_buf_array->buf.body.recovery_address = input_buffer + head_overflow_len;
+        align_buf_array->buf.body.aligned_buffer = input_buffer + head_overflow_len;
+        align_buf_array->buf.body.length = buffer_len - head_overflow_len - tail_overflow_len;
+        need_cache_sync[1] = true;
+        // tail: received into the second cache line of the stash buffer
+        align_buf_array->buf.tail.recovery_address = input_buffer + buffer_len - tail_overflow_len;
+        align_buf_array->buf.tail.aligned_buffer = stash_buffer + split_line_size * extra_buf_count++;
+        align_buf_array->buf.tail.length = tail_overflow_len;
+        need_cache_sync[2] = int_mem_cache_line_size > 0;
+
+        // special handling when input_buffer length is no more than buffer alignment
+        // (the whole buffer then goes through the head stash, otherwise the body length above would underflow)
+        if (head_overflow_len >= buffer_len || tail_overflow_len >= buffer_len) {
+            align_buf_array->buf.head.length  = buffer_len ;
+            align_buf_array->buf.body.length  = 0 ;
+            align_buf_array->buf.tail.length  = 0 ;
+        }
+    } else {
+        align_buf_array->buf.body.aligned_buffer = rx_buffer;
+        align_buf_array->buf.body.recovery_address = rx_buffer;
+        align_buf_array->buf.body.length = buffer_len;
+        need_cache_sync[1] = false;
+    }
+
+    // an empty piece carries no addresses and needs no cache sync
+    for (int i = 0; i < 3; i++) {
+        if (align_buf_array->aligned_buffer[i].length == 0) {
+            align_buf_array->aligned_buffer[i].aligned_buffer = NULL;
+            align_buf_array->aligned_buffer[i].recovery_address = NULL;
+            need_cache_sync[i] = false;
+        }
+    }
+
+    // invalidate the aligned buffer if necessary
+    for (int i = 0; i < 3; i++) {
+        if (need_cache_sync[i]) {
+            size_t sync_size = align_buf_array->aligned_buffer[i].length;
+            if (sync_size < split_line_size) {
+                // If the size is smaller than the cache line, we need to sync the split buffer (must be cache line sized)
+                sync_size = split_line_size;
+            }
+            esp_cache_msync(align_buf_array->aligned_buffer[i].aligned_buffer, sync_size, ESP_CACHE_MSYNC_FLAG_DIR_M2C);
+        }
+    }
+
+    *ret_stash_buffer = stash_buffer;
+    return ESP_OK;
+}
+
+/**
+ * @brief Copy the head and tail pieces of a split RX buffer back to their original locations
+ *
+ * @param[in] align_array Split description whose head/tail pieces are copied from
+ *                        `aligned_buffer` to `recovery_address`; must not be NULL
+ * @return
+ *      - ESP_OK: head and tail copied back (pieces with zero length are skipped)
+ *      - ESP_ERR_INVALID_ARG: `align_array` is NULL
+ */
+esp_err_t esp_dma_merge_aligned_rx_buffers(dma_buffer_split_array_t *align_array)
+{
+    ESP_RETURN_ON_FALSE_ISR(align_array, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
+
+    // the body piece is not copied here; only head and tail are restored
+    const dma_buffer_split_info_t *head = &align_array->buf.head;
+    const dma_buffer_split_info_t *tail = &align_array->buf.tail;
+
+    if (head->length > 0) {
+        memcpy(head->recovery_address, head->aligned_buffer, head->length);
+    }
+    if (tail->length > 0) {
+        memcpy(tail->recovery_address, tail->aligned_buffer, tail->length);
+    }
+    return ESP_OK;
+}

 esp_err_t esp_dma_capable_malloc(size_t size, const esp_dma_mem_info_t *dma_mem_info, void **out_ptr, size_t *actual_size)
 {
--- a/components/esp_hw_support/dma/gdma_link.c
+++ b/components/esp_hw_support/dma/gdma_link.c
@ -6,14 +6,8 @@

 #include <stdlib.h>
 #include <string.h>
-#include <stdatomic.h>
 #include <sys/cdefs.h>
-#include <sys/lock.h>
-#include "sdkconfig.h"
-#include "freertos/FreeRTOS.h"
-#include "freertos/task.h"
 #include "soc/soc_caps.h"
-#include "soc/ext_mem_defs.h"
 #include "esp_log.h"
 #include "esp_check.h"
 #include "esp_memory_utils.h"
--- a/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h
+++ b/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h
@ -0,0 +1,88 @@
+/*
+ * SPDX-FileCopyrightText: 2023-2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include "esp_err.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief DMA buffer information
+ *
+ * Describes one part (head, body or tail) of a split RX buffer.
+ */
+typedef struct {
+    void *aligned_buffer;   //!< Address of the aligned buffer used for this part
+    void *recovery_address; //!< Address in the original buffer that the content of `aligned_buffer` is copied back to when merging
+    size_t length;          //!< Length of this part in bytes, 0 means the part does not exist
+} dma_buffer_split_info_t;
+
+/**
+ * @brief DMA buffer aligned array
+ * The array contains three parts: head, body and tail.
+ * The length of each part is >= 0; a length of 0 means that the part does not exist.
+ * The named members in `buf` and the `aligned_buffer` array share the same storage,
+ * so a part can be accessed either by name or by index (0: head, 1: body, 2: tail).
+ */
+typedef struct {
+    union {
+        struct {
+            dma_buffer_split_info_t head;               //!< Aligned head part. Corresponds to the part of the original buffer where the head is not aligned
+            dma_buffer_split_info_t body;               //!< Aligned body part. Corresponds to the part of the original buffer that is already aligned
+            dma_buffer_split_info_t tail;               //!< Aligned tail part. Corresponds to the part of the original buffer where the tail is not aligned
+        } buf;
+        dma_buffer_split_info_t aligned_buffer[3];      //!< DMA aligned buffer array, consisting of `head`, `body` and `tail` (in that order)
+    };
+} dma_buffer_split_array_t;
+
+/**
+ * @brief Split DMA RX buffer to cache aligned buffers
+ *
+ * @note After the original RX buffer is split into an array, the caller should mount the buffer array to the DMA controller in scatter-gather mode.
+ *       Don't read/write the aligned buffers before the DMA has finished using them.
+ * @note Parts that end up with a length of 0 have their `aligned_buffer` and `recovery_address` set to NULL.
+ * @note The aligned buffers that need it are cache-invalidated (memory-to-cache sync) before this function returns.
+ *
+ * @param[in]   rx_buffer        The original DMA buffer used for receiving data
+ * @param[in]   buffer_len       Length of rx_buffer, in bytes
+ * @param[out]  align_buf_array  Aligned DMA buffer array
+ * @param[out]  ret_stash_buffer Allocated stash buffer (caller should free it after use)
+ * @return
+ *      - ESP_OK: Split to aligned buffer successfully
+ *      - ESP_ERR_INVALID_ARG: Split to aligned buffer failed because of invalid argument
+ *
+ *  brief sketch:
+ *                  cache alignment delimiter    cache alignment delimiter
+ *                              │                             │
+ *     Origin Buffer            │        Origin Buffer        │
+ *           │                  │              │              │
+ *           │                  ▼              ▼              ▼
+ *           │       ...---xxxxx|xxxxxxxxxxxxxxxxxxxxxxxxxxxxx|xxxxx----...
+ *           │               │                 │                 │
+ *           │               │                 ▼                 │
+ *           │               │  |xxxxxxxxxxxxxxxxxxxxxxxxxxxxx|  │
+ *           │               │                 ▲                 │
+ *           ▼               │                 │                 │
+ *     Aligned buffers       └──► Head        Body   Tail ◄──────┘
+ *                                 │                  │
+ *                                 ▼                  ▼
+ *                              |xxxxx......|     |xxxxx......|
+ */
+esp_err_t esp_dma_split_rx_buffer_to_cache_aligned(void *rx_buffer, size_t buffer_len, dma_buffer_split_array_t *align_buf_array, uint8_t** ret_stash_buffer);
+
+/**
+ * @brief Merge aligned RX buffer array to the original buffer
+ *
+ * Copies the head and tail parts (when their length is non-zero) from their aligned buffers
+ * back to their recovery addresses. The body part is not copied.
+ *
+ * @note This function can be used in the ISR context.
+ *
+ * @param[in] align_buf_array Aligned DMA buffer array
+ * @return
+ *      - ESP_OK: Merge aligned buffer to original buffer successfully
+ *      - ESP_ERR_INVALID_ARG: Merge aligned buffer to original buffer failed because of invalid argument
+ */
+esp_err_t esp_dma_merge_aligned_rx_buffers(dma_buffer_split_array_t *align_buf_array);
+
+#ifdef __cplusplus
+}
+#endif
--- a/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c
+++ b/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: 2021-2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2021-2025 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
@ -8,27 +8,21 @@
 #include <string.h>
 #include <inttypes.h>
 #include <sys/param.h>
+#include "unity.h"
+#include "soc/soc_caps.h"
 #include "esp_heap_caps.h"
-#include "esp_rom_sys.h"
 #include "freertos/FreeRTOS.h"
 #include "freertos/task.h"
 #include "freertos/semphr.h"
-#include "unity.h"
 #include "ccomp_timer.h"
 #include "esp_async_memcpy.h"
-#include "soc/soc_caps.h"
-#include "hal/dma_types.h"
+#if SOC_GDMA_SUPPORTED
+#include "hal/gdma_ll.h"
+#endif

 #define IDF_LOG_PERFORMANCE(item, value_fmt, value, ...) \
    printf("[Performance][%s]: " value_fmt "\n", item, value, ##__VA_ARGS__)

-#define ALIGN_UP(addr, align) (((addr) + (align)-1) & ~((align)-1))
-#define ALIGN_DOWN(size, align)  ((size) & ~((align) - 1))
-
-#if CONFIG_IDF_TARGET_ESP32P4
-#define TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE 1
-#endif
-
 typedef struct {
    uint32_t seed;
    size_t buffer_size;
@ -37,8 +31,9 @@ typedef struct {
    uint8_t *dst_buf;
    uint8_t *from_addr;
    uint8_t *to_addr;
-    uint32_t align;
-    uint32_t offset;
+    uint32_t align; // alignment required by DMA engine
+    uint32_t src_offset;
+    uint32_t dst_offset;
    bool src_in_psram;
    bool dst_in_psram;
 } memcpy_testbench_context_t;
@ -46,7 +41,6 @@ typedef struct {
 static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_context)
 {
    srand(test_context->seed);
-    printf("allocating memory buffer...\r\n");
    size_t buffer_size = test_context->buffer_size;
    size_t copy_size = buffer_size;
    uint8_t *src_buf = NULL;
@ -63,13 +57,11 @@ static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_contex
    TEST_ASSERT_NOT_NULL(dst_buf);

    // adding extra offset
-    from_addr = src_buf + test_context->offset;
-    to_addr = dst_buf;
-    copy_size -= test_context->offset;
-    copy_size &= ~(test_context->align - 1);
+    from_addr = src_buf + test_context->src_offset;
+    to_addr = dst_buf + test_context->dst_offset;
+    copy_size -= MAX(test_context->src_offset, test_context->dst_offset);

-    printf("...to copy size %zu Bytes, from @%p, to @%p\r\n", copy_size, from_addr, to_addr);
-    printf("fill src buffer with random data\r\n");
+    printf("copy @%p --> @%p, %zu Bytes\r\n", from_addr, to_addr, copy_size);
    for (int i = 0; i < copy_size; i++) {
        from_addr[i] = rand() % 256;
    }
@ -82,28 +74,23 @@ static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_contex
    test_context->to_addr = to_addr;
 }

-static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t copy_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr)
+static void async_memcpy_verify_and_clear_testbench(uint32_t copy_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr)
 {
-    srand(seed);
    // check that the destination matches the source byte by byte; the first mismatch is printed and fails the test
    for (int i = 0; i < copy_size; i++) {
-        TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, from_addr[i], "source data doesn't match generator data");
-    }
-    srand(seed);
-    for (int i = 0; i < copy_size; i++) {
-        TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, to_addr[i], "destination data doesn't match source data");
+        if (from_addr[i] != to_addr[i]) {
+            printf("location[%d]:s=%d,d=%d\r\n", i, from_addr[i], to_addr[i]);
+            TEST_FAIL_MESSAGE("destination data doesn't match source data");
+        }
    }
    free(src_buf);
    free(dst_buf);
 }

-TEST_CASE("memory copy the same buffer with different content", "[async mcp]")
+static void test_memory_copy_with_same_buffer(async_memcpy_handle_t driver)
 {
-    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
-    async_memcpy_handle_t driver = NULL;
-    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
-    uint8_t *sbuf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
-    uint8_t *dbuf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
+    uint8_t *sbuf = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
+    uint8_t *dbuf = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
    TEST_ASSERT_NOT_NULL(sbuf);
    TEST_ASSERT_NOT_NULL(dbuf);

@ -119,77 +106,35 @@ TEST_CASE("memory copy the same buffer with different content", "[async mcp]")
            }
        }
    }
-    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
    free(sbuf);
    free(dbuf);
 }

-static void test_memory_copy_one_by_one(async_memcpy_handle_t driver)
+TEST_CASE("memory copy the same buffer with different content", "[async mcp]")
 {
-    uint32_t aligned_test_buffer_size[] = {256, 512, 1024, 2048, 4096};
-    memcpy_testbench_context_t test_context = {
-        .align = 4,
-    };
-
-    for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) {
-        test_context.buffer_size = aligned_test_buffer_size[i];
-        test_context.seed = i;
-        test_context.offset = 0;
-        async_memcpy_setup_testbench(&test_context);
-
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, NULL, NULL));
-        vTaskDelay(pdMS_TO_TICKS(10));
-        async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf,
-                                                test_context.dst_buf, test_context.from_addr, test_context.to_addr);
-    }
-
-#if !TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE
-    uint32_t unaligned_test_buffer_size[] = {255, 511, 1023, 2047, 4095, 5011};
-    for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) {
-        // Test different align edge
-        for (int off = 0; off < 4; off++) {
-            test_context.buffer_size = unaligned_test_buffer_size[i];
-            test_context.seed = i;
-            test_context.offset = off;
-            async_memcpy_setup_testbench(&test_context);
-
-            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, NULL, NULL));
-            vTaskDelay(pdMS_TO_TICKS(10));
-            async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf,
-                                                    test_context.dst_buf, test_context.from_addr, test_context.to_addr);
-        }
-    }
-#endif
-}
-
-TEST_CASE("memory copy by DMA one by one", "[async mcp]")
-{
-    async_memcpy_config_t config = {
-        .backlog = 4,
-    };
+    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    async_memcpy_handle_t driver = NULL;

 #if SOC_AHB_GDMA_SUPPORTED
-    printf("Testing memory by AHB GDMA\r\n");
+    printf("Testing memcpy by AHB GDMA\r\n");
    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&config, &driver));
-    test_memory_copy_one_by_one(driver);
+    test_memory_copy_with_same_buffer(driver);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
 #endif // SOC_AHB_GDMA_SUPPORTED

 #if SOC_AXI_GDMA_SUPPORTED
-    printf("Testing memory by AXI GDMA\r\n");
+    printf("Testing memcpy by AXI GDMA\r\n");
    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&config, &driver));
-    test_memory_copy_one_by_one(driver);
+    test_memory_copy_with_same_buffer(driver);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
 #endif // SOC_AXI_GDMA_SUPPORTED

 #if SOC_CP_DMA_SUPPORTED
-    printf("Testing memory by CP DMA\r\n");
+    printf("Testing memcpy by CP DMA\r\n");
    TEST_ESP_OK(esp_async_memcpy_install_cpdma(&config, &driver));
-    test_memory_copy_one_by_one(driver);
+    test_memory_copy_with_same_buffer(driver);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
 #endif // SOC_CP_DMA_SUPPORTED
-
 }

 static bool test_async_memcpy_cb_v1(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
@ -200,208 +145,235 @@ static bool test_async_memcpy_cb_v1(async_memcpy_handle_t mcp_hdl, async_memcpy_
    return high_task_wakeup == pdTRUE;
 }

-TEST_CASE("memory copy done callback", "[async mcp]")
+// Copy buffers of several sizes one at a time, applying the same offset (0..3) to source and destination.
+// Each copy is waited for on the semaphore handed to the callback, then destination and source are compared.
+static void test_memory_copy_blocking(async_memcpy_handle_t driver)
 {
-    async_memcpy_config_t config = {
-        // all default
-    };
-    async_memcpy_handle_t driver = NULL;
-    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
-
-    uint8_t *src_buf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
-    uint8_t *dst_buf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
-    TEST_ASSERT_NOT_NULL(src_buf);
-    TEST_ASSERT_NOT_NULL(dst_buf);
-
    SemaphoreHandle_t sem = xSemaphoreCreateBinary();
-    TEST_ESP_OK(esp_async_memcpy(driver, dst_buf, src_buf, 256, test_async_memcpy_cb_v1, sem));
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
-    free(src_buf);
-    free(dst_buf);
+    // the handle is passed to the copy-done callback, so fail early instead of handing it a NULL semaphore
+    TEST_ASSERT_NOT_NULL(sem);
+    const uint32_t test_buffer_size[] = {256, 512, 1024, 2048, 4096, 5012};
+    memcpy_testbench_context_t test_context = {
+        .align = 4,
+    };
+    for (int i = 0; i < sizeof(test_buffer_size) / sizeof(test_buffer_size[0]); i++) {
+        // Test different align edge
+        for (int off = 0; off < 4; off++) {
+            test_context.buffer_size = test_buffer_size[i];
+            test_context.seed = i;
+            test_context.src_offset = off;
+            test_context.dst_offset = off;
+            async_memcpy_setup_testbench(&test_context);
+
+            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_cb_v1, sem));
+            TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(10)));
+            async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf,
+                                                    test_context.from_addr, test_context.to_addr);
+        }
+    }
    vSemaphoreDelete(sem);
 }

-TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
+TEST_CASE("memory copy by DMA (blocking)", "[async mcp]")
 {
-    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
-    async_memcpy_handle_t driver = NULL;
-    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
-
-    uint32_t aligned_test_buffer_size[] = {512, 1024, 2048, 4096, 4608};
-    memcpy_testbench_context_t test_context[5] = {
-        [0 ... 4] = {
-            .align = 4,
-        }
+    async_memcpy_config_t config = {
+        .backlog = 1,
+        .dma_burst_size = 0,
    };
+    async_memcpy_handle_t driver = NULL;

-    // Aligned case
-    for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) {
-        test_context[i].seed = i;
-        test_context[i].buffer_size = aligned_test_buffer_size[i];
-        async_memcpy_setup_testbench(&test_context[i]);
-    }
-    for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL));
-    }
-    for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) {
-        async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
-    }
-
-#if !TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE
-    uint32_t unaligned_test_buffer_size[] = {511, 1023, 2047, 4095, 5011};
-    // Non-aligned case
-    for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) {
-        test_context[i].seed = i;
-        test_context[i].buffer_size = unaligned_test_buffer_size[i];
-        test_context[i].offset = 3;
-        async_memcpy_setup_testbench(&test_context[i]);
-    }
-    for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL));
-    }
-    for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) {
-        async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
-    }
-#endif
-
+#if SOC_AHB_GDMA_SUPPORTED
+    printf("Testing memcpy by AHB GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&config, &driver));
+    test_memory_copy_blocking(driver);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AHB_GDMA_SUPPORTED
+
+#if SOC_AXI_GDMA_SUPPORTED
+    printf("Testing memcpy by AXI GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&config, &driver));
+    test_memory_copy_blocking(driver);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AXI_GDMA_SUPPORTED
+
+#if SOC_CP_DMA_SUPPORTED
+    printf("Testing memcpy by CP DMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_cpdma(&config, &driver));
+    test_memory_copy_blocking(driver);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_CP_DMA_SUPPORTED
 }

-#define TEST_ASYNC_MEMCPY_BENCH_COUNTS   (8)
-static int s_count = 0;
-
-static IRAM_ATTR bool test_async_memcpy_isr_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
+// Copy buffers of several sizes with the destination offset one byte larger than the source offset (0..3),
+// so the destination is shifted by one byte relative to the source. Each copy is waited for and verified.
+[[maybe_unused]] static void test_memcpy_with_dest_addr_unaligned(async_memcpy_handle_t driver, bool src_in_psram, bool dst_in_psram)
 {
-    SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
+    SemaphoreHandle_t sem = xSemaphoreCreateBinary();
+    // the handle is passed to the copy-done callback, so fail early instead of handing it a NULL semaphore
+    TEST_ASSERT_NOT_NULL(sem);
+    const uint32_t test_buffer_size[] = {256, 512, 1024, 2048, 4096, 5012};
+    memcpy_testbench_context_t test_context = {
+        .align = 4,
+        .src_in_psram = src_in_psram,
+        .dst_in_psram = dst_in_psram,
+    };
+    for (int i = 0; i < sizeof(test_buffer_size) / sizeof(test_buffer_size[0]); i++) {
+        // Test different alignment
+        for (int off = 0; off < 4; off++) {
+            test_context.buffer_size = test_buffer_size[i];
+            test_context.seed = i;
+            test_context.src_offset = off;
+            test_context.dst_offset = off + 1;
+            async_memcpy_setup_testbench(&test_context);
+
+            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_cb_v1, sem));
+            TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(10)));
+            async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf,
+                                                    test_context.from_addr, test_context.to_addr);
+        }
+    }
+    vSemaphoreDelete(sem);
+}
+
+TEST_CASE("memory copy with dest address unaligned", "[async mcp]")
+{
+    [[maybe_unused]] async_memcpy_config_t driver_config = { // every use below sits inside an #if and may be compiled out
+        .backlog = 4,
+        .dma_burst_size = 32,
+    };
+    [[maybe_unused]] async_memcpy_handle_t driver = NULL;
+
+    // each backend section below installs a driver, runs the unaligned-destination test, then uninstalls it
+#if SOC_CP_DMA_SUPPORTED
+    printf("Testing memcpy by CP DMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_cpdma(&driver_config, &driver));
+    test_memcpy_with_dest_addr_unaligned(driver, false, false); // SRAM -> SRAM
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_CP_DMA_SUPPORTED
+
+#if SOC_AHB_GDMA_SUPPORTED && !GDMA_LL_AHB_RX_BURST_NEEDS_ALIGNMENT
+    printf("Testing memcpy by AHB GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver));
+    test_memcpy_with_dest_addr_unaligned(driver, false, false); // SRAM -> SRAM
+#if SOC_AHB_GDMA_SUPPORT_PSRAM
+    test_memcpy_with_dest_addr_unaligned(driver, true, true); // PSRAM -> PSRAM
+#endif // SOC_AHB_GDMA_SUPPORT_PSRAM
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AHB_GDMA_SUPPORTED && !GDMA_LL_AHB_RX_BURST_NEEDS_ALIGNMENT
+
+#if SOC_AXI_GDMA_SUPPORTED
+    printf("Testing memcpy by AXI GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver));
+    test_memcpy_with_dest_addr_unaligned(driver, false, false); // SRAM -> SRAM
+#if SOC_AXI_GDMA_SUPPORT_PSRAM
+    test_memcpy_with_dest_addr_unaligned(driver, true, true); // PSRAM -> PSRAM
+#endif // SOC_AXI_GDMA_SUPPORT_PSRAM
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AXI_GDMA_SUPPORTED
+}
+
+#define TEST_ASYNC_MEMCPY_BENCH_COUNTS 16 // number of copies issued per benchmark measurement
+
+typedef struct {
+    int perf_count;        // number of completion callbacks received so far
+    SemaphoreHandle_t sem; // given once perf_count reaches TEST_ASYNC_MEMCPY_BENCH_COUNTS
+} mcp_perf_user_context_t;
+
+static IRAM_ATTR bool test_async_memcpy_perf_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
+{
+    mcp_perf_user_context_t* user = (mcp_perf_user_context_t*)cb_args; // cb_args carries the per-benchmark counter and semaphore
    BaseType_t high_task_wakeup = pdFALSE;
-    s_count++;
-    if (s_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) {
-        xSemaphoreGiveFromISR(sem, &high_task_wakeup);
+    user->perf_count++;
+    if (user->perf_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) { // only the last expected completion releases the waiter
+        xSemaphoreGiveFromISR(user->sem, &high_task_wakeup);
    }
    return high_task_wakeup == pdTRUE; // true when giving the semaphore set the wake-up flag
 }

-static void memcpy_performance_test(uint32_t buffer_size)
+// Measure and print the throughput of TEST_ASYNC_MEMCPY_BENCH_COUNTS copies of `buffer_size` bytes,
+// first with CPU memcpy and then with the given async memcpy driver, between SRAM and/or PSRAM buffers.
+static void test_memcpy_performance(async_memcpy_handle_t driver, uint32_t buffer_size, bool src_in_psram, bool dst_in_psram)
 {
-    SemaphoreHandle_t sem = xSemaphoreCreateBinary();
-
-    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
-    config.backlog = (buffer_size / DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1) * TEST_ASYNC_MEMCPY_BENCH_COUNTS;
-    config.dma_burst_size = 64;   // set a big burst size for performance
-    async_memcpy_handle_t driver = NULL;
    int64_t elapse_us = 0;
    float throughput = 0.0;
-    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));

-    // 1. SRAM->SRAM
    memcpy_testbench_context_t test_context = {
-        .align = config.dma_burst_size,
+        .align = 32, // set alignment same as the burst size, to achieve the best performance
        .buffer_size = buffer_size,
-        .src_in_psram = false,
-        .dst_in_psram = false,
+        .src_in_psram = src_in_psram,
+        .dst_in_psram = dst_in_psram,
    };
    async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
-    }
-    // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+
+    // get CPU memcpy performance
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
    }
    elapse_us = ccomp_timer_stop();
    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: %s->%s", throughput, src_in_psram ? "PSRAM" : "SRAM", dst_in_psram ? "PSRAM" : "SRAM");

-#if SOC_AHB_GDMA_SUPPORT_PSRAM
-    // 2. PSRAM->PSRAM
-    test_context.src_in_psram = true;
-    test_context.dst_in_psram = true;
-    async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
+    // get DMA memcpy performance
+    // create (and check) the semaphore before starting the timer, so its allocation is not counted as copy time
+    mcp_perf_user_context_t user_context = {
+        .perf_count = 0,
+        .sem = xSemaphoreCreateBinary()
+    };
+    TEST_ASSERT_NOT_NULL(user_context.sem);
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_perf_cb, &user_context));
    }
    // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
+    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(user_context.sem, pdMS_TO_TICKS(1000)));
    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
-    }
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+    // NOTE(review): the CPU copy above already filled the destination with the same data,
+    // so this comparison alone cannot tell whether the DMA copy wrote anything
+    async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+    throughput = (float)buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: %s->%s", throughput, src_in_psram ? "PSRAM" : "SRAM", dst_in_psram ? "PSRAM" : "SRAM");

-    // 3. PSRAM->SRAM
-    test_context.src_in_psram = true;
-    test_context.dst_in_psram = false;
-    async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
-    }
-    // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
-    }
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+    vSemaphoreDelete(user_context.sem);
+}

-    // 4. SRAM->PSRAM
-    test_context.src_in_psram = false;
-    test_context.dst_in_psram = true;
-    async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
-    }
-    // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
-    }
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
-#endif
+TEST_CASE("memory copy performance 40KB: SRAM->SRAM", "[async mcp]")
+{
+    async_memcpy_config_t driver_config = {
+        .backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS,
+        .dma_burst_size = 32,
+    };
+    async_memcpy_handle_t driver = NULL;

+#if SOC_AHB_GDMA_SUPPORTED
+    printf("Testing memcpy by AHB GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, false, false);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
-    vSemaphoreDelete(sem);
+#endif // SOC_AHB_GDMA_SUPPORTED
+
+#if SOC_AXI_GDMA_SUPPORTED
+    printf("Testing memcpy by AXI GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, false, false);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AXI_GDMA_SUPPORTED
+
+#if SOC_CP_DMA_SUPPORTED
+    printf("Testing memcpy by CP DMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_cpdma(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, false, false);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_CP_DMA_SUPPORTED
 }

-TEST_CASE("memory copy performance test 40KB", "[async mcp]")
+#if SOC_SPIRAM_SUPPORTED
+TEST_CASE("memory copy performance 40KB: PSRAM->PSRAM", "[async mcp]")
 {
-    memcpy_performance_test(40 * 1024);
-}
+    [[maybe_unused]] async_memcpy_config_t driver_config = {
+        .backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS,
+        .dma_burst_size = 32,
+    };
+    [[maybe_unused]] async_memcpy_handle_t driver = NULL;

-TEST_CASE("memory copy performance test 4KB", "[async mcp]")
-{
-    memcpy_performance_test(4 * 1024);
+#if SOC_AHB_GDMA_SUPPORTED && SOC_AHB_GDMA_SUPPORT_PSRAM
+    printf("Testing memcpy by AHB GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, true, true);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AHB_GDMA_SUPPORTED && SOC_AHB_GDMA_SUPPORT_PSRAM
+
+#if SOC_AXI_GDMA_SUPPORTED && SOC_AXI_GDMA_SUPPORT_PSRAM
+    printf("Testing memcpy by AXI GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, true, true);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AXI_GDMA_SUPPORTED && SOC_AXI_GDMA_SUPPORT_PSRAM
 }
+#endif
--- a/components/esp_hw_support/test_apps/dma/main/test_gdma.c
+++ b/components/esp_hw_support/test_apps/dma/main/test_gdma.c
@ -14,6 +14,7 @@
 #include "esp_heap_caps.h"
 #include "esp_private/gdma.h"
 #include "esp_private/gdma_link.h"
+#include "esp_private/esp_dma_utils.h"
 #include "hal/dma_types.h"
 #include "soc/soc_caps.h"
 #include "hal/gdma_ll.h"
@ -22,6 +23,9 @@
 #include "esp_cache.h"
 #include "esp_memory_utils.h"

+// Round `num` up (ALIGN_UP) or down (ALIGN_DOWN) to a multiple of `align`.
+// The mask `~((align) - 1)` only produces a multiple of `align` when `align` is a power of two.
+// `align` is expanded twice in ALIGN_UP, so do not pass an expression with side effects.
+#define ALIGN_UP(num, align)    (((num) + ((align) - 1)) & ~((align) - 1))
+#define ALIGN_DOWN(num, align)  ((num) & ~((align) - 1))
+
 TEST_CASE("GDMA channel allocation", "[GDMA]")
 {
    gdma_channel_alloc_config_t channel_config = {};
@ -147,22 +151,9 @@ TEST_CASE("GDMA channel allocation", "[GDMA]")
 #endif // GDMA_LL_AXI_PAIRS_PER_GROUP >= 2
 }

-static bool test_gdma_m2m_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data)
+static void test_gdma_config_link_list(gdma_channel_handle_t tx_chan, gdma_channel_handle_t rx_chan,
+                                     gdma_link_list_handle_t *tx_link_list, gdma_link_list_handle_t *rx_link_list, size_t sram_alignment, bool dma_link_in_ext_mem)
 {
-    BaseType_t task_woken = pdFALSE;
-    SemaphoreHandle_t done_sem = (SemaphoreHandle_t)user_data;
-    xSemaphoreGiveFromISR(done_sem, &task_woken);
-    return task_woken == pdTRUE;
-}
-
-static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, gdma_channel_handle_t rx_chan, bool dma_link_in_ext_mem)
-{
-    size_t sram_alignment = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA);
-    gdma_rx_event_callbacks_t rx_cbs = {
-        .on_recv_eof = test_gdma_m2m_rx_eof_callback,
-    };
-    SemaphoreHandle_t done_sem = xSemaphoreCreateBinary();
-    TEST_ESP_OK(gdma_register_rx_event_callbacks(rx_chan, &rx_cbs, done_sem));

    gdma_strategy_config_t strategy = {
        .auto_update_desc = true,
@ -189,24 +180,46 @@ static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, gdma_channel_handl
            .check_owner = true,
        }
    };
-    gdma_link_list_handle_t tx_link_list = NULL;
-    TEST_ESP_OK(gdma_new_link_list(&tx_link_list_config, &tx_link_list));
-    // allocate the source buffer from SRAM
-    uint8_t *src_data = heap_caps_calloc(1, 128, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
-    TEST_ASSERT_NOT_NULL(src_data);
-
+    TEST_ESP_OK(gdma_new_link_list(&tx_link_list_config, tx_link_list));
    // create DMA link list for RX channel
    gdma_link_list_config_t rx_link_list_config = {
        .buffer_alignment = sram_alignment, // RX buffer should be aligned to the cache line size, because we will do cache invalidate later
        .item_alignment = 8, // 8-byte alignment required by the AXI-GDMA
-        .num_items = 1,
+        .num_items = 5,
        .flags = {
            .items_in_ext_mem = dma_link_in_ext_mem,
            .check_owner = true,
        },
    };
+    TEST_ESP_OK(gdma_new_link_list(&rx_link_list_config, rx_link_list));
+}
+
+// RX EOF callback of the M2M test: gives the semaphore that was registered as `user_data`.
+// `dma_chan` and `event_data` are not used.
+// Returns true when xSemaphoreGiveFromISR() reported that a higher priority task was woken.
+static bool test_gdma_m2m_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data)
+{
+    BaseType_t need_yield = pdFALSE;
+    xSemaphoreGiveFromISR((SemaphoreHandle_t)user_data, &need_yield);
+    return need_yield == pdTRUE;
+}
+
+static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, gdma_channel_handle_t rx_chan, bool dma_link_in_ext_mem)
+{
+    size_t sram_alignment = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA);
+    gdma_rx_event_callbacks_t rx_cbs = {
+        .on_recv_eof = test_gdma_m2m_rx_eof_callback,
+    };
+    SemaphoreHandle_t done_sem = xSemaphoreCreateBinary();
+    TEST_ASSERT_NOT_NULL(done_sem);
+    TEST_ESP_OK(gdma_register_rx_event_callbacks(rx_chan, &rx_cbs, done_sem));
+
+    gdma_link_list_handle_t tx_link_list = NULL;
    gdma_link_list_handle_t rx_link_list = NULL;
-    TEST_ESP_OK(gdma_new_link_list(&rx_link_list_config, &rx_link_list));
+    test_gdma_config_link_list(tx_chan, rx_chan, &tx_link_list, &rx_link_list, sram_alignment, dma_link_in_ext_mem);
+
+    // allocate the source buffer from SRAM
+    uint8_t *src_data = heap_caps_calloc(1, 128, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
+    TEST_ASSERT_NOT_NULL(src_data);
+
    // allocate the destination buffer from SRAM
    uint8_t *dst_data = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
    TEST_ASSERT_NOT_NULL(dst_data);
@ -270,7 +283,7 @@ static void test_gdma_m2m_mode(gdma_channel_handle_t tx_chan, gdma_channel_handl
    TEST_ESP_OK(gdma_start(rx_chan, gdma_link_get_head_addr(rx_link_list)));
    TEST_ESP_OK(gdma_start(tx_chan, gdma_link_get_head_addr(tx_link_list)));

-    xSemaphoreTake(done_sem, portMAX_DELAY);
+    xSemaphoreTake(done_sem, 1000 / portTICK_PERIOD_MS);

    if (sram_alignment) {
        // the destination data are not reflected to the cache, so do an invalidate to ask the cache load new data
@ -344,3 +357,133 @@ TEST_CASE("GDMA M2M Mode", "[GDMA][M2M]")
    TEST_ESP_OK(gdma_del_channel(rx_chan));
 #endif // SOC_AXI_GDMA_SUPPORTED
 }
+
+// Context handed to the unaligned-RX EOF callback through its `user_data` pointer
+typedef struct {
+    SemaphoreHandle_t done_sem;            // given by the callback after the RX buffers were merged
+    dma_buffer_split_array_t *align_array; // passed by the callback to esp_dma_merge_aligned_rx_buffers()
+} test_gdma_context_t;
+
+// RX EOF callback of the unaligned-buffer test.
+// Merges the split RX buffers described by the context, then gives the context's semaphore.
+// `user_data` must point to a test_gdma_context_t; `dma_chan` and `event_data` are not used.
+// Returns true when xSemaphoreGiveFromISR() reported that a higher priority task was woken.
+// NOTE(review): the FromISR API suggests this runs in interrupt context — confirm that a failing TEST_ESP_OK is acceptable there.
+static bool test_gdma_m2m_unaligned_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data)
+{
+    test_gdma_context_t *ctx = (test_gdma_context_t *)user_data;
+    BaseType_t need_yield = pdFALSE;
+
+    TEST_ESP_OK(esp_dma_merge_aligned_rx_buffers(ctx->align_array));
+    xSemaphoreGiveFromISR(ctx->done_sem, &need_yield);
+
+    return need_yield == pdTRUE;
+}
+
+// Copy `data_length` bytes from `src_data` to `dst_data + offset_len` through a freshly allocated
+// AHB GDMA TX/RX channel pair and verify the destination content.
+//
+// dst_data     destination buffer; offset_len + data_length bytes of it are accessed
+// src_data     source buffer; its first data_length bytes are overwritten with an incrementing byte pattern
+// data_length  number of bytes to transfer
+// offset_len   byte offset into dst_data at which the received data must start
+//
+// Channels, link lists, the semaphore and the stash buffer are released before returning on the success path.
+static void test_gdma_m2m_unaligned_buffer_test(uint8_t *dst_data, uint8_t *src_data, size_t data_length, size_t offset_len)
+{
+    TEST_ASSERT_NOT_NULL(src_data);
+    TEST_ASSERT_NOT_NULL(dst_data);
+    gdma_channel_handle_t tx_chan = NULL;
+    gdma_channel_handle_t rx_chan = NULL;
+    gdma_channel_alloc_config_t tx_chan_alloc_config = {
+        .direction = GDMA_CHANNEL_DIRECTION_TX,
+        .flags.reserve_sibling = true,
+    };
+    TEST_ESP_OK(gdma_new_ahb_channel(&tx_chan_alloc_config, &tx_chan));
+    gdma_channel_alloc_config_t rx_chan_alloc_config = {
+        .direction = GDMA_CHANNEL_DIRECTION_RX,
+        .sibling_chan = tx_chan,
+    };
+    TEST_ESP_OK(gdma_new_ahb_channel(&rx_chan_alloc_config, &rx_chan));
+    size_t sram_alignment = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA);
+
+    gdma_link_list_handle_t tx_link_list = NULL;
+    gdma_link_list_handle_t rx_link_list = NULL;
+    test_gdma_config_link_list(tx_chan, rx_chan, &tx_link_list, &rx_link_list, sram_alignment, false);
+
+    // prepare the source data: an incrementing pattern that wraps every 256 bytes
+    for (size_t i = 0; i < data_length; i++) {
+        src_data[i] = (uint8_t)i;
+    }
+    if (sram_alignment) {
+        // do write-back for the source data because it's in the cache
+        TEST_ESP_OK(esp_cache_msync(src_data, ALIGN_UP(data_length, sram_alignment), ESP_CACHE_MSYNC_FLAG_DIR_C2M));
+    }
+
+    gdma_buffer_mount_config_t tx_buf_mount_config[] = {
+        [0] = {
+            .buffer = src_data,
+            .length = data_length,
+            .flags = {
+                .mark_eof = true,
+                .mark_final = true, // using singly list, so terminate the link here
+            }
+        }
+    };
+    TEST_ESP_OK(gdma_link_mount_buffers(tx_link_list, 0, tx_buf_mount_config, sizeof(tx_buf_mount_config) / sizeof(gdma_buffer_mount_config_t), NULL));
+
+    // split the (possibly unaligned) destination region and mount every resulting piece on the RX link list
+    // NOTE(review): stash_buffer is presumably allocated by the split helper, since it is freed below — confirm
+    dma_buffer_split_array_t align_array = {0};
+    gdma_buffer_mount_config_t rx_aligned_buf_mount_config[3] = {0};
+    const size_t rx_buf_count = sizeof(rx_aligned_buf_mount_config) / sizeof(rx_aligned_buf_mount_config[0]);
+    uint8_t *stash_buffer = NULL;
+    TEST_ESP_OK(esp_dma_split_rx_buffer_to_cache_aligned(dst_data + offset_len, data_length, &align_array, &stash_buffer));
+    for (size_t i = 0; i < rx_buf_count; i++) {
+        rx_aligned_buf_mount_config[i].buffer = align_array.aligned_buffer[i].aligned_buffer;
+        rx_aligned_buf_mount_config[i].length = align_array.aligned_buffer[i].length;
+    }
+    TEST_ESP_OK(gdma_link_mount_buffers(rx_link_list, 0, rx_aligned_buf_mount_config, rx_buf_count, NULL));
+
+    gdma_rx_event_callbacks_t rx_cbs = {
+        .on_recv_eof = test_gdma_m2m_unaligned_rx_eof_callback,
+    };
+    SemaphoreHandle_t done_sem = xSemaphoreCreateBinary();
+    TEST_ASSERT_NOT_NULL(done_sem);
+    // the callback needs both the semaphore and the split description, so hand them over together
+    test_gdma_context_t user_ctx = {
+        .done_sem = done_sem,
+        .align_array = &align_array,
+    };
+    TEST_ESP_OK(gdma_register_rx_event_callbacks(rx_chan, &rx_cbs, &user_ctx));
+
+    TEST_ESP_OK(gdma_start(rx_chan, gdma_link_get_head_addr(rx_link_list)));
+    TEST_ESP_OK(gdma_start(tx_chan, gdma_link_get_head_addr(tx_link_list)));
+
+    // a timeout means the EOF callback never ran: fail here instead of validating a buffer
+    // and deleting channels that the DMA may still be using
+    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(done_sem, 1000 / portTICK_PERIOD_MS));
+
+    // validate the destination data
+    for (size_t i = 0; i < data_length; i++) {
+        TEST_ASSERT_EQUAL(i % 256, dst_data[i + offset_len]);
+    }
+
+    TEST_ESP_OK(gdma_del_link_list(tx_link_list));
+    TEST_ESP_OK(gdma_del_link_list(rx_link_list));
+    TEST_ESP_OK(gdma_del_channel(tx_chan));
+    TEST_ESP_OK(gdma_del_channel(rx_chan));
+    vSemaphoreDelete(done_sem);
+    free(stash_buffer);
+}
+
+// Run the unaligned RX buffer transfer for a series of (length, destination offset) combinations,
+// reusing one 10240-byte source buffer and one 10240-byte destination buffer for all of them.
+TEST_CASE("GDMA M2M Unaligned RX Buffer Test", "[GDMA][M2M]")
+{
+    // each entry: number of bytes to copy and the byte offset of the destination inside dbuf
+    static const struct {
+        size_t data_length;
+        size_t offset_len;
+    } test_vectors[] = {
+        // case buffer len less than buffer alignment
+        {60, 0},
+        {60, 4},
+        {60, 2},
+        // case buffer head aligned
+        {246, 0},
+        {8182, 0},
+        // case buffer tail aligned
+        {246, 10},
+        {8182, 10},
+        // case buffer unaligned
+        {100, 10},
+        {10, 60},
+        {256, 10},
+        {8192, 10},
+        // case buffer full aligned
+        {256, 0},
+        {8192, 0},
+    };
+
+    uint8_t *sbuf = heap_caps_aligned_calloc(64, 1, 10240, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
+    uint8_t *dbuf = heap_caps_aligned_calloc(64, 1, 10240, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
+
+    // the helper itself asserts that both buffers are non-NULL
+    for (size_t idx = 0; idx < sizeof(test_vectors) / sizeof(test_vectors[0]); idx++) {
+        test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, test_vectors[idx].data_length, test_vectors[idx].offset_len);
+    }
+
+    free(sbuf);
+    free(dbuf);
+}