高性能计算实践-OpenCV图像矩阵转置 transpose SIMD加速(ippicv)复现

技术分享 7个月前 (11-19) 0 999+

说明

矩阵转置是高性能计算中的经典问题。OpenCV 的 transpose 函数内部依赖 ippicv 库中的 ippiTranspose_8u_C1R 实现。本文将对该优化算法进行复现与分析。

与上一篇基于 cv::flip / ippiMirror 的图像翻转不同，矩阵转置不再是简单的行内倒序，而是将整幅图像在行列维度上重新映射。我们可以用块划分（tiling）的遍历方式来解决，同时加上各种优化技巧。

复现

 #ifdef _MSC_VER
#define FORCE_INLINE __forceinline
#elif defined(__GNUC__)
#define FORCE_INLINE __attribute__((always_inline)) inline
#else
#define FORCE_INLINE inline
#endif

/**
 * Loads an 8x8 block of bytes (one 8-byte row per pointer) and transposes it
 * in registers.
 *
 * On return every output register holds two consecutive rows of the
 * transposed block, i.e. two source columns of 8 bytes each:
 * c0 = columns 0,1   c1 = columns 2,3   c2 = columns 4,5   c3 = columns 6,7.
 *
 * Shared by both 8x8 micro-kernels below so the shuffle network exists once.
 */
FORCE_INLINE void load_transpose_8x8(const uint8_t* src0,
                                     const uint8_t* src1,
                                     const uint8_t* src2,
                                     const uint8_t* src3,
                                     const uint8_t* src4,
                                     const uint8_t* src5,
                                     const uint8_t* src6,
                                     const uint8_t* src7,
                                     __m128i& c0,
                                     __m128i& c1,
                                     __m128i& c2,
                                     __m128i& c3) {
    // Each load reads exactly 8 bytes into the low half of a register.
    const __m128i r0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src0));
    const __m128i r1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src1));
    const __m128i r2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src2));
    const __m128i r3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src3));
    const __m128i r4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src4));
    const __m128i r5 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src5));
    const __m128i r6 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src6));
    const __m128i r7 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src7));

    // Stage 1: interleave bytes of neighbouring rows (row pairs 0/1, 2/3, 4/5, 6/7).
    const __m128i t0 = _mm_unpacklo_epi8(r0, r1);
    const __m128i t1 = _mm_unpacklo_epi8(r2, r3);
    const __m128i t2 = _mm_unpacklo_epi8(r4, r5);
    const __m128i t3 = _mm_unpacklo_epi8(r6, r7);

    // Stage 2: interleave 16-bit pairs -> groups of 4 rows per column.
    const __m128i t4 = _mm_unpacklo_epi16(t0, t1); // rows 0..3, columns 0..3
    const __m128i t5 = _mm_unpacklo_epi16(t2, t3); // rows 4..7, columns 0..3
    const __m128i t6 = _mm_unpackhi_epi16(t0, t1); // rows 0..3, columns 4..7
    const __m128i t7 = _mm_unpackhi_epi16(t2, t3); // rows 4..7, columns 4..7

    // Stage 3: interleave 32-bit groups -> full 8-byte columns.
    c0 = _mm_unpacklo_epi32(t4, t5);
    c1 = _mm_unpackhi_epi32(t4, t5);
    c2 = _mm_unpacklo_epi32(t6, t7);
    c3 = _mm_unpackhi_epi32(t6, t7);
}

/**
 * 8x8 SSE2 transpose micro-kernel with contiguous output.
 *
 * Reads 8 source rows (8 bytes each), transposes them and writes the resulting
 * 64 bytes linearly to pDst (row k of the transposed block at pDst + 8*k).
 *
 * pDst must be 16-byte aligned because aligned stores are used; the only
 * caller in this file passes offsets that are multiples of 64 into an
 * alignas(64) buffer.
 */
FORCE_INLINE void transpose_8x8_store_contiguous(const uint8_t* src0,
                                                 const uint8_t* src1,
                                                 const uint8_t* src2,
                                                 const uint8_t* src3,
                                                 const uint8_t* src4,
                                                 const uint8_t* src5,
                                                 const uint8_t* src6,
                                                 const uint8_t* src7,
                                                 uint8_t* pDst) {
    __m128i c0, c1, c2, c3;
    load_transpose_8x8(src0, src1, src2, src3, src4, src5, src6, src7, c0, c1, c2, c3);

    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 0), c0);
    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 16), c1);
    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 32), c2);
    _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 48), c3);
}

/**
 * Transposes one 64x64 byte tile through a 4 KiB stack buffer.
 *
 * Phase 1 walks the source in row-major order of 8x8 blocks, transposes each
 *         block and appends it to the buffer, so the buffer holds the blocks
 *         as [B(0,0)] [B(0,1)] ... [B(0,7)] [B(1,0)] ...
 * Phase 2 gathers, for every destination row, the matching 8-byte row from 8
 *         vertically stacked blocks and writes one full 64-byte row.
 *
 * @tparam UseStream  true: write with non-temporal stores, which requires
 *                    every destination row to be 16-byte aligned;
 *                    false: write with unaligned stores.
 * @param pSrc     top-left byte of the source tile
 * @param srcStep  source row pitch in bytes
 * @param pDst     top-left byte of the destination tile
 * @param dstStep  destination row pitch in bytes
 */
template <bool UseStream>
FORCE_INLINE void transpose_64x64_tile_impl(const uint8_t* pSrc,
                                            unsigned int srcStep,
                                            uint8_t* pDst,
                                            unsigned int dstStep) {
    constexpr int BLOCK_BYTES = 8 * 8;             // one transposed 8x8 block
    constexpr int BLOCK_ROW_BYTES = 8 * BLOCK_BYTES; // 8 blocks = one block row (512 bytes)

    alignas(64) uint8_t tmp[64 * 64];

    // Pitches are widened once so that no row offset is computed in 32 bits.
    const size_t sStep = srcStep;
    const size_t dStep = dstStep;

    // Phase 1: source -> buffer, keeping the source access linear (Y, then X).
    uint8_t* tmpPtr = tmp;
    const uint8_t* band = pSrc;
    for (int y = 0; y < 64; y += 8) {
        for (int x = 0; x < 64; x += 8) {
            const uint8_t* p = band + x;
            transpose_8x8_store_contiguous(p,
                                           p + sStep,
                                           p + 2 * sStep,
                                           p + 3 * sStep,
                                           p + 4 * sStep,
                                           p + 5 * sStep,
                                           p + 6 * sStep,
                                           p + 7 * sStep,
                                           tmpPtr);
            tmpPtr += BLOCK_BYTES;
        }
        band += 8 * sStep;
    }

    // Phase 2: buffer -> destination.
    // Destination strip `colBlock` (8 rows) is built from source block column
    // `colBlock`, i.e. blocks B(0,colBlock) .. B(7,colBlock). Block (row, col)
    // sits at index row*8 + col, so these blocks are 512 bytes apart.
    for (int colBlock = 0; colBlock < 8; ++colBlock) {
        const uint8_t* bBase = tmp + colBlock * BLOCK_BYTES;

        for (int r = 0; r < 8; ++r) {
            // Row r inside each transposed block starts r*8 bytes into the block.
            const uint8_t* lane = bBase + r * 8;

            const __m128i b0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 0 * BLOCK_ROW_BYTES));
            const __m128i b1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 1 * BLOCK_ROW_BYTES));
            const __m128i b2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 2 * BLOCK_ROW_BYTES));
            const __m128i b3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 3 * BLOCK_ROW_BYTES));
            const __m128i b4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 4 * BLOCK_ROW_BYTES));
            const __m128i b5 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 5 * BLOCK_ROW_BYTES));
            const __m128i b6 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 6 * BLOCK_ROW_BYTES));
            const __m128i b7 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(lane + 7 * BLOCK_ROW_BYTES));

            // Glue pairs of 8-byte pieces into 16-byte vectors.
            const __m128i v0 = _mm_unpacklo_epi64(b0, b1);
            const __m128i v1 = _mm_unpacklo_epi64(b2, b3);
            const __m128i v2 = _mm_unpacklo_epi64(b4, b5);
            const __m128i v3 = _mm_unpacklo_epi64(b6, b7);

            // Strip colBlock, row r -> tile row colBlock*8 + r.
            uint8_t* dstRowPtr = pDst + static_cast<size_t>(colBlock * 8 + r) * dStep;

            if (UseStream) { // compile-time constant: only one branch survives optimisation
                // Non-temporal path: dstRowPtr must be 16-byte aligned.
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 0), v0);
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 16), v1);
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 32), v2);
                _mm_stream_si128(reinterpret_cast<__m128i*>(dstRowPtr + 48), v3);
            } else {
                // Unaligned path: valid for any pointer / pitch.
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 0), v0);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 16), v1);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 32), v2);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dstRowPtr + 48), v3);
            }
        }
    }
}

/**
 * 8x8 transpose used for tile edges: reads an 8x8 block with pitch srcStep and
 * writes the transposed block with pitch dstStep (8 bytes per destination row).
 * Uses 8-byte stores only, so it has no alignment requirement.
 */
FORCE_INLINE void transpose_8x8_u8_to_strided(const uint8_t* pSrc,
                                              unsigned int srcStep,
                                              uint8_t* pDst,
                                              unsigned int dstStep) {
    const size_t sStep = srcStep;
    const size_t dStep = dstStep;

    __m128i c0, c1, c2, c3;
    load_transpose_8x8(pSrc,
                       pSrc + sStep,
                       pSrc + 2 * sStep,
                       pSrc + 3 * sStep,
                       pSrc + 4 * sStep,
                       pSrc + 5 * sStep,
                       pSrc + 6 * sStep,
                       pSrc + 7 * sStep,
                       c0, c1, c2, c3);

    // Low half of each register is an even destination row, high half the odd one.
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 0 * dStep), c0);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 1 * dStep), _mm_srli_si128(c0, 8));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 2 * dStep), c1);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 3 * dStep), _mm_srli_si128(c1, 8));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 4 * dStep), c2);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 5 * dStep), _mm_srli_si128(c2, 8));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 6 * dStep), c3);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(pDst + 7 * dStep), _mm_srli_si128(c3, 8));
}

/**
 * Transposes an arbitrary width x height region of 8-bit pixels.
 *
 * The region is split into
 *   1. the 64-aligned interior, handled by 64x64 tiles,
 *   2. the right edge   (width  % 64 columns, 64-aligned rows),
 *   3. the bottom edge  (height % 64 rows, 64-aligned columns),
 *   4. the bottom-right corner (scalar).
 * Edges 2 and 3 use 8x8 blocks plus a scalar remainder.
 *
 * All byte offsets are computed in size_t so that images whose offsets do not
 * fit in 32 bits are addressed correctly.
 *
 * @tparam UseStream  forwarded to the 64x64 tile kernel; when true the caller
 *                    guarantees pDst and dstStep are multiples of 16.
 * @return always 0 (also for non-positive width/height, which is a no-op).
 */
template <bool UseStream>
int64_t icv_y8_owniTransposeWxH_8uC1_impl(const uint8_t* pSrc,
                                          unsigned int srcStep,
                                          uint8_t* pDst,
                                          unsigned int dstStep,
                                          int width,
                                          int height) {
    if (width <= 0 || height <= 0)
        return 0;

    constexpr int TILE = 64;
    constexpr int MICRO = 8;

    const size_t sStep = srcStep;
    const size_t dStep = dstStep;

    const int wMain = width & ~(TILE - 1);  // columns covered by 64x64 tiles
    const int hMain = height & ~(TILE - 1); // rows covered by 64x64 tiles

    // 1. Interior: source tile at (x, y) lands at destination tile (y, x).
    for (int y = 0; y < hMain; y += TILE) {
        for (int x = 0; x < wMain; x += TILE) {
            const uint8_t* srcTile = pSrc + static_cast<size_t>(y) * sStep + x;
            uint8_t* dstTile = pDst + static_cast<size_t>(x) * dStep + y;
            transpose_64x64_tile_impl<UseStream>(srcTile, srcStep, dstTile, dstStep);
        }
    }

    // 2. Right edge: hMain rows x wTail columns. Independent of UseStream
    //    because only 8-byte and scalar stores are used here.
    const int wTail = width - wMain;
    if (wTail > 0) {
        const int wTailMain = wTail & ~(MICRO - 1); // part handled by 8x8 blocks
        const int wTailTail = wTail - wTailMain;    // scalar remainder (< 8 columns)

        for (int y = 0; y < hMain; y += MICRO) {
            const uint8_t* srcRow = pSrc + static_cast<size_t>(y) * sStep + wMain;
            uint8_t* dstCol = pDst + static_cast<size_t>(wMain) * dStep + y;

            int xOff = 0;
            for (; xOff < wTailMain; xOff += MICRO) {
                transpose_8x8_u8_to_strided(srcRow + xOff, srcStep,
                                            dstCol + static_cast<size_t>(xOff) * dStep, dstStep);
            }
            // Scalar remainder for the current band of 8 source rows.
            for (int k = 0; k < MICRO; ++k) {
                for (int x = 0; x < wTailTail; ++x) {
                    dstCol[static_cast<size_t>(xOff + x) * dStep + k] =
                        srcRow[static_cast<size_t>(k) * sStep + (xOff + x)];
                }
            }
        }
    }

    // 3. Bottom edge: hBottomTail rows x wMain columns.
    const int hBottomTail = height - hMain;
    if (hBottomTail > 0) {
        const int hBottomMain = hBottomTail & ~(MICRO - 1);     // part handled by 8x8 blocks
        const int hBottomTailTail = hBottomTail - hBottomMain;  // scalar remainder (< 8 rows)

        for (int x = 0; x < wMain; x += MICRO) {
            const uint8_t* srcCol = pSrc + static_cast<size_t>(hMain) * sStep + x;
            uint8_t* dstRow = pDst + static_cast<size_t>(x) * dStep + hMain;

            int yOff = 0;
            for (; yOff < hBottomMain; yOff += MICRO) {
                transpose_8x8_u8_to_strided(srcCol + static_cast<size_t>(yOff) * sStep, srcStep,
                                            dstRow + yOff, dstStep);
            }
            // Scalar remainder for the current band of 8 source columns.
            for (int k = 0; k < MICRO; ++k) {
                for (int y = 0; y < hBottomTailTail; ++y) {
                    dstRow[static_cast<size_t>(k) * dStep + (yOff + y)] =
                        srcCol[static_cast<size_t>(yOff + y) * sStep + k];
                }
            }
        }
    }

    // 4. Bottom-right corner: wTail x hBottomTail, plain scalar copy.
    if (wTail > 0 && hBottomTail > 0) {
        const uint8_t* srcCorner = pSrc + static_cast<size_t>(hMain) * sStep + wMain;
        uint8_t* dstCorner = pDst + static_cast<size_t>(wMain) * dStep + hMain;
        for (int y = 0; y < hBottomTail; ++y) {
            for (int x = 0; x < wTail; ++x) {
                dstCorner[static_cast<size_t>(x) * dStep + y] =
                    srcCorner[static_cast<size_t>(y) * sStep + x];
            }
        }
    }

    // Non-temporal stores are weakly ordered; fence before returning so the
    // written data is ordered with respect to subsequent stores.
    if (UseStream) {
        _mm_sfence();
    }
    return 0;
}

/**
 * Dispatcher for the WxH kernel.
 *
 * Selects the non-temporal (stream) variant only when both the destination
 * address and the destination pitch are multiples of 16; tile origins are at
 * multiples of 64 rows/columns, so every tile row is then 16-byte aligned.
 * Otherwise the unaligned-store variant is used.
 */
int64_t icv_y8_owniTransposeWxH_8uC1(const uint8_t* pSrc,
                                     unsigned int srcStep,
                                     uint8_t* pDst,
                                     unsigned int dstStep,
                                     int width,
                                     int height) {
    const uintptr_t alignBits = reinterpret_cast<uintptr_t>(pDst) | static_cast<uintptr_t>(dstStep);
    const bool isAligned = (alignBits & 0xF) == 0;

    if (isAligned) {
        return icv_y8_owniTransposeWxH_8uC1_impl<true>(pSrc, srcStep, pDst, dstStep, width, height);
    }
    return icv_y8_owniTransposeWxH_8uC1_impl<false>(pSrc, srcStep, pDst, dstStep, width, height);
}

/**
 * Top-level transpose: splits the image into 512x512 blocks and forwards each
 * block (and the right / bottom / corner remainders) to
 * icv_y8_owniTransposeWxH_8uC1.
 *
 * Visiting order of the main grid: the outer loop walks source block columns
 * (= destination block rows), the inner loop walks source block rows, so
 * consecutive calls write horizontally adjacent destination blocks.
 *
 * Offsets are computed in size_t; the previous int arithmetic overflowed for
 * images whose byte offsets exceed 2^31 - 1.
 *
 * @param pSrc     first byte of the source image (height rows of width bytes)
 * @param srcStep  source row pitch in bytes
 * @param pDst     first byte of the destination image (width rows of height bytes)
 * @param dstStep  destination row pitch in bytes
 * @return the value returned by the last kernel call, or 0 if nothing was done.
 */
int64_t icv_transpose_8u_C1(const uint8_t* pSrc,
                            unsigned int srcStep,
                            uint8_t* pDst,
                            unsigned int dstStep,
                            int width,
                            int height) {
    constexpr int TILE = 512;

    if (width <= 0 || height <= 0) {
        return 0;
    }

    const size_t sStep = srcStep;
    const size_t dStep = dstStep;

    const int h_main = height & ~(TILE - 1); // height - height % 512
    const int w_main = width & ~(TILE - 1);  // width  - width  % 512
    const int h_tail = height - h_main;      // height % 512
    const int w_tail = width - w_main;       // width  % 512

    int64_t last_ret = 0; // return value of the most recent kernel call

    // 1. Main grid of full 512x512 blocks.
    for (int x = 0; x < w_main; x += TILE) {
        for (int y = 0; y < h_main; y += TILE) {
            const uint8_t* srcBlock = pSrc + static_cast<size_t>(y) * sStep + x;
            uint8_t* dstBlock = pDst + static_cast<size_t>(x) * dStep + y;
            last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, TILE, TILE);
        }
    }

    // 2. Right edge: blocks of w_tail x 512.
    if (w_tail > 0) {
        for (int y = 0; y < h_main; y += TILE) {
            const uint8_t* srcBlock = pSrc + static_cast<size_t>(y) * sStep + w_main;
            uint8_t* dstBlock = pDst + static_cast<size_t>(w_main) * dStep + y;
            last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, w_tail, TILE);
        }
    }

    // 3. Bottom edge: blocks of 512 x h_tail.
    if (h_tail > 0) {
        for (int x = 0; x < w_main; x += TILE) {
            const uint8_t* srcBlock = pSrc + static_cast<size_t>(h_main) * sStep + x;
            uint8_t* dstBlock = pDst + static_cast<size_t>(x) * dStep + h_main;
            last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, TILE, h_tail);
        }
    }

    // 4. Bottom-right corner: w_tail x h_tail.
    if (h_tail > 0 && w_tail > 0) {
        const uint8_t* srcBlock = pSrc + static_cast<size_t>(h_main) * sStep + w_main;
        uint8_t* dstBlock = pDst + static_cast<size_t>(w_main) * dStep + h_main;
        last_ret = icv_y8_owniTransposeWxH_8uC1(srcBlock, srcStep, dstBlock, dstStep, w_tail, h_tail);
    }

    return last_ret;
}

性能测试

普通 C++ 基准代码如下；相对于该基准实现，上文的 SIMD 优化版本一般可以得到约 6 倍加速。

/**
 * Plain C++ cache-blocked transpose used as the baseline.
 *
 * Copies Src(ii, jj) to Dst(jj, ii) for every ii in [0, height) and jj in
 * [0, width), visiting the matrix in BLOCK_SIZE x BLOCK_SIZE blocks.
 *
 * @param pSrc     source matrix, `height` rows of `width` elements
 * @param srcStep  source row pitch, in elements of T (it is used to index pSrc)
 * @param pDst     destination matrix, `width` rows of `height` elements
 * @param dstStep  destination row pitch, in elements of T
 * @param width    number of source columns
 * @param height   number of source rows
 *
 * Indices are computed in size_t: the earlier `jj * dstStep + ii` form was
 * evaluated in 32-bit unsigned arithmetic and wrapped for large matrices.
 */
template <typename T>
void simple_transpose(const T* pSrc, unsigned int srcStep,
                      T* pDst, unsigned int dstStep,
                      int width, int height) {
    // 64x64 bytes = 4 KiB for uint8_t; wider element types may want a smaller block.
    constexpr int BLOCK_SIZE = 64;

    const size_t sStep = srcStep;
    const size_t dStep = dstStep;

    // (i, j) is the top-left corner of the current block.
    for (int i = 0; i < height; i += BLOCK_SIZE) {
        // Clamp so that sizes not divisible by BLOCK_SIZE are handled.
        const int i_max = std::min(i + BLOCK_SIZE, height);

        for (int j = 0; j < width; j += BLOCK_SIZE) {
            const int j_max = std::min(j + BLOCK_SIZE, width);

            for (int ii = i; ii < i_max; ++ii) {
                const T* srcRow = pSrc + static_cast<size_t>(ii) * sStep;
                T* dstCol = pDst + ii; // column ii of the destination

                for (int jj = j; jj < j_max; ++jj) {
                    dstCol[static_cast<size_t>(jj) * dStep] = srcRow[jj];
                }
            }
        }
    }
}

为了节约篇幅就不展示功能测试了。

性能测试代码如下

class TransposeFixture : public benchmark::Fixture { public:     void SetUp(const ::benchmark::State& state) override     {         width = state.range(0);         height = state.range(1);         srcStep = width;          src = std::make_unique<uint8_t[]>(static_cast<size_t>(height) * srcStep);         dst = std::make_unique<uint8_t[]>(static_cast<size_t>(width) * height);          for (int i = 0; i < height; ++i) {             for (int j = 0; j < width; ++j) {                 src[i * srcStep + j] = static_cast<uint8_t>((i + j) % 256);             }         }          srcMat = cv::Mat(height, width, CV_8UC1, src.get(), srcStep);         dstMat = cv::Mat(width, height, CV_8UC1, dst.get(), height);          bytes_per_iteration = static_cast<int64_t>(width) * height * 2;     }      void TearDown(const ::benchmark::State&) override     {         src.reset();         dst.reset();     }  protected:     int width{};     int height{};     int srcStep{};     int64_t bytes_per_iteration{};     std::unique_ptr<uint8_t[]> src;     std::unique_ptr<uint8_t[]> dst;     cv::Mat srcMat;  // 预创建的cv::Mat对象     cv::Mat dstMat;  // 预创建的cv::Mat对象 };  BENCHMARK_DEFINE_F(TransposeFixture, Optimized)(benchmark::State& state) {     for (auto _ : state) {         icv_transpose_8u_C1(                 src.get(), srcStep,                 dst.get(), height,                 width, height         );         benchmark::DoNotOptimize(dst.get());         benchmark::ClobberMemory();     }     state.SetBytesProcessed(state.iterations() * bytes_per_iteration); }  BENCHMARK_DEFINE_F(TransposeFixture, OpenCV)(benchmark::State& state) {     for (auto _ : state) {         cv::transpose(srcMat, dstMat);         benchmark::DoNotOptimize(dstMat.data);         benchmark::ClobberMemory();     }     state.SetBytesProcessed(state.iterations() * bytes_per_iteration); }

性能测试结果如下

TransposeFixture/Optimized/4096/4096    1538876 ns      1537400 ns          498 bytes_per_second=20.3265Gi/s
TransposeFixture/OpenCV/4096/4096       6065054 ns      5998884 ns          112 bytes_per_second=5.2093Gi/s
TransposeFixture/Optimized/2050/1920     285198 ns       288771 ns         2489 bytes_per_second=25.3882Gi/s
TransposeFixture/OpenCV/2050/1920        279878 ns       284630 ns         2635 bytes_per_second=25.7576Gi/s

可以看到，在 4096x4096 的大图上，本文实现的吞吐量约为 OpenCV 的 3.9 倍（20.3 GiB/s 对 5.2 GiB/s）；在 2050x1920 的普通尺寸图像上，两者性能接近（25.4 GiB/s 对 25.8 GiB/s）。

发表评论