Android NDK之使用 arm-v7a 汇编实现两数之和
关键词: NDK armv7a WebRTC arm汇编 CMake
最近适配对讲程序,在webrtc的库编译的过程中,发现其为arm的平台定制了汇编程序以优化平方根倒数算法速度,上次写汇编还是8086的,借此机会初步尝试下android上arm汇编
具体jni工程建立就不介绍了,Android Studio直接可以从模板创建
工程目录如下
kryo@WSL1:/mnt/k/Android/NDK-Project/XXX/src/main$ tree . ├── AndroidManifest.xml ├── cpp │ ├── asm │ │ ├── CMakeLists.txt │ │ ├── asm_defines.h │ │ ├── asm_jni.cpp │ │ ├── asm_jni.h │ │ ├── tow_sum_armv7a.S │ │ └── tow_sum_cpp.cpp └── java └── com └── kryo ├── asm │ └── TowSumAsm.java └── ...
1、C++接口编写
asm_jni.h
#ifndef TOW_SUM_AMS_TEST_H #define TOW_SUM_AMS_TEST_H #include <jni.h> #ifdef USE_ASM extern "C" int32_t tow_sum_asm(int32_t *data_in, int32_t *data_out, int32_t data_len, int32_t ret_len, int32_t target); #else extern "C" int32_t tow_sum_cpp(int32_t *data_in, int32_t *data_out, int32_t data_len, int32_t target); #endif #endif //TOW_SUM_AMS_TEST_H
这里分别使用asm和c代码各自实现一个暴搜版本的两数之和接口。关于asm传递5个参数是有用意的,涉及到函数调用约定,armv7a前4个参数用寄存器传参,超过4个的用栈传递
2、汇编实现
写汇编时我习惯先参考C代码去推导
tow_sum_cpp.cpp
#include "asm_jni.h" extern "C" int32_t tow_sum_cpp(int32_t *data_in, int32_t *data_out, int32_t data_len,int32_t target) { for (int i = 0; i < data_len; ++i) { for (int j = i + 1; j < data_len; ++j) { if (data_in[i] + data_in[j] == target) { data_out[0] = i; data_out[1] = j; return 0; } } } data_out[0] = 0; data_out[1] = 0; return -1; }
以下是具体汇编代码的实现,基本每行都给出了注释
tow_sum_armv7a.S
@ Input:( @ int32_t* data_in, -> r0 &data_in @ int32_t* data_out,-> r1 &data_out @ int32_t data_len, -> r2 @ int32_t ret_len, -> r3 @ int32_t target -> [sp]) @ Output: r0 32 bit unsigned integer @ @ r4: i-index @ r5: j-index @ r6: target @ r7: num1-buff @ r8: num2-buff @ r9: sum cache #include "asm_defines.h" GLOBAL_FUNCTION tow_sum .align 4 DEFINE_FUNCTION tow_sum push {r4-r11} @ 保存现场 ldr r6, [sp, #32] @ 保存了8个寄存器,偏移8*4bytes取得第5个参数 mov r4, #0 @ 初始化第一个数的索引 i mov r5, #0 @ 初始化第二个数的索引 j LOOP_1: sub r9, r2, #1 @ 数组长度-1 cmp r4, r9 @ 判断i是否数组最后一个 beq FAL @ 是就查找失败 mov r5, r4 @ j = i LOOP_2: add r5, r5, #1 @ j ++ lsl r9, r4, #2 @ 把索引 i 乘4得到地址偏移量 ldr r7, [r0, r9] @ r7 = data_in[i],寄存器相对寻址, r0为 data_in的地址,加上偏移量取的数组元素 lsl r9, r5, #2 ldr r8, [r0, r9] @ 同上得到 r8 = data_in[j] add r9, r8, r7 @ 两数之和 cmp r9, r6 @ 与目标做比较 beq SUC @ 成功 add r9, r5, #1 @ 没有成功 cmp r9, r2 @ if j < data_len bne LOOP_2 @ then:下一轮j的查找 add r4, r4, #1 @ else: j没找到,把i++ b LOOP_1 @ 下一轮 i的查找 SUC: str r4, [r1] @ data_out[0] = i str r5, [r1, #4] @ data_out[1] = j mov r0, #0 @ return 0 b END FAL: mov r4, #0 mov r5, #0 mov r0, #-1 @ return -1 b SUC END: pop {r4-r11} @ 还原现场 bx lr
3、JNI实现
asm_jni.cpp
#include "asm_jni.h" #include <android/log.h> #define TAG "ASM_TEST" #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, TAG, __VA_ARGS__) #ifdef __cplusplus extern "C" { #endif JNIEXPORT jintArray JNICALL Java_com_kryo_asm_TowSumAsm_towsum(JNIEnv *env, jobject thiz, jintArray data, jint target) { jintArray r_array = env->NewIntArray(2); jint *elements_out = env->GetIntArrayElements(r_array, NULL); jsize length = env->GetArrayLength(data); jint *elements_in = env->GetIntArrayElements(data, NULL); #ifdef USE_ASM LOGD("call tow_sum_asm !n"); tow_sum_asm(elements_in, elements_out, (size_t) length, 2, (size_t) target); #else LOGD("call tow_sum_cpp !n"); tow_sum_cpp(elements_in, elements_out, (size_t) length, (size_t) target); #endif env->ReleaseIntArrayElements(data, elements_in, 0); env->ReleaseIntArrayElements(r_array, elements_out, 0); return r_array; } #ifdef __cplusplus } #endif
TowSumAsm.java
public class TowSumAsm { static { System.loadLibrary("asm"); } public native int[] towsum(int[] data, int target); }
最后贴一下从webrtc开源代码中copy来的asm_defines.h
/* * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef KRYO_INCLUDE_ASM_DEFINES_H_ #define KRYO_INCLUDE_ASM_DEFINES_H_ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif // Define the macros used in ARM assembly code, so that for Mac or iOS builds // we add leading underscores for the function names. #ifdef __APPLE__ .macro GLOBAL_FUNCTION name .global _name .private_extern _name .endm .macro DEFINE_FUNCTION name _name: .endm .macro CALL_FUNCTION name bl _name .endm .macro GLOBAL_LABEL name .global _name .private_extern _name .endm #else .macro GLOBAL_FUNCTION name .global name .hidden name .endm .macro DEFINE_FUNCTION name #if defined(__linux__) && defined(__ELF__) .type name,%function #endif name: .endm .macro CALL_FUNCTION name bl name .endm .macro GLOBAL_LABEL name .global name .hidden name .endm #endif // With Apple's clang compiler, for instructions ldrb, strh, etc., // the condition code is after the width specifier. Here we define // only the ones that are actually used in the assembly files. #if (defined __llvm__) && (defined __APPLE__) .macro streqh reg1, reg2, num strheq reg1, reg2, num .endm #endif .text #endif // KRYO_INCLUDE_ASM_DEFINES_H_
4、CMakeLists.txt编写生成libasm.so
CMakeLists.txt
cmake_minimum_required(VERSION 3.10.2) project("asm") ENABLE_LANGUAGE(ASM) #启用汇编支持 if(${ANDROID_ABI} STREQUAL "armeabi-v7a") add_library(asm SHARED asm_jni.cpp tow_sum_armv7a.S) add_definitions(-DUSE_ASM) elseif(${ANDROID_ABI} STREQUAL "arm64-v8a") add_library(asm SHARED asm_jni.cpp tow_sum_cpp.cpp) else() message(FATAL_ERROR "Unsupported ABI: ${ANDROID_ABI}") endif() target_link_libraries(asm log)
5、运行测试
TowSumAsm towSumAsm = new TowSumAsm(); int[] result = towSumAsm.towsum(new int[]{1, 3, 5, 7, 9}, 12); Log.d(TAG, "result " + result[0] + " " + result[1]);
2024-04-05 10:24:29.269 19863-19863 ASM_TEST com.kryo.demo D call tow_sum_asm ! 2024-04-05 10:24:29.269 19863-19863 JNI_Activity com.kryo.demo D result 1 4
Reference
- VS Code 上ARM指令集参考文档插件: 32位Code4Leg ,64位ARM A64 Instruction Reference
- Android-Audio-Processing-Using-WebRTC spl_sqrt_floor_arm.S