在 A527 平台上测试了一下 NEON 加速的效果。
测试内容:对每个元素计算 res = (a + 3.4) * 3.1,并对比普通 C++ 循环与 NEON 实现的耗时。
# include <arm_neon.h>
# include <chrono>
# include <iostream>
# include <random>
# include <vector>
/**
 * Benchmarks res = (a + 3.4) * 3.1 over a buffer of 1080 * 720 * 3 floats
 * filled with random values in [0, 255), using three implementations:
 *   1. a plain scalar C++ loop,
 *   2. NEON add followed by multiply (vaddq_f32 + vmulq_f32),
 *   3. NEON multiply-accumulate (vmlaq_f32) on the expanded form
 *      a * 3.1 + 10.54.
 * The elapsed time of each variant and element 0 of every buffer are written
 * to stdout.
 *
 * @param argc unused
 * @param argv unused
 * @return always 0
 */
int main(int argc, char const *argv[])
{
    constexpr int kCount = 1080 * 720 * 3;
    constexpr int kLanes = 4;  // floats per float32x4_t
    // The NEON loops have no scalar tail, so the element count must divide evenly.
    static_assert(kCount % kLanes == 0, "element count must be a multiple of the NEON lane count");

    // steady_clock is monotonic; system_clock may be adjusted while a loop is
    // being timed, which would corrupt the measured interval.
    using Clock = std::chrono::steady_clock;
    const auto elapsed_us = [](Clock::time_point from, Clock::time_point to) {
        return std::chrono::duration_cast<std::chrono::microseconds>(to - from).count();
    };

    // std::vector owns the buffers, so nothing leaks on exit. It also
    // value-initializes its elements, i.e. every output buffer has been
    // written once before its timed loop starts.
    std::vector<float> input(kCount);
    std::default_random_engine e;
    std::uniform_real_distribution<float> u(0, 255);
    for (float &value : input) {
        value = u(e);
    }

    // --- 1. Scalar baseline ---------------------------------------------
    std::vector<float> res1(kCount);
    Clock::time_point start_time = Clock::now();
    for (int i = 0; i < kCount; ++i) {
        // float literals keep the arithmetic in single precision, matching
        // what the NEON variants compute (double literals would promote).
        res1[i] = (input[i] + 3.4f) * 3.1f;
    }
    Clock::time_point end_time = Clock::now();
    std::cout << "cost total time : " << elapsed_us(start_time, end_time) << " microseconds -- common method" << std::endl;

    // --- 2. NEON: add, then multiply ------------------------------------
    std::vector<float> res2(kCount);
    start_time = Clock::now();
    float32x4_t A = vdupq_n_f32(3.4f);
    float32x4_t B = vdupq_n_f32(3.1f);
    for (int i = 0; i < kCount; i += kLanes) {
        const float32x4_t C = vld1q_f32(input.data() + i);
        const float32x4_t D = vmulq_f32(vaddq_f32(C, A), B);
        vst1q_f32(res2.data() + i, D);
    }
    end_time = Clock::now();
    std::cout << "cost total time : " << elapsed_us(start_time, end_time) << " microseconds -- neon method" << std::endl;

    // --- 3. NEON: multiply-accumulate -----------------------------------
    // (a + 3.4) * 3.1 == a * 3.1 + 10.54, so one vmlaq_f32 per vector suffices.
    std::vector<float> res3(kCount);
    start_time = Clock::now();
    A = vdupq_n_f32(10.54f);
    B = vdupq_n_f32(3.1f);
    for (int i = 0; i < kCount; i += kLanes) {
        const float32x4_t C = vld1q_f32(input.data() + i);
        // vmlaq_f32(A, B, C) yields A + B * C per lane.
        const float32x4_t D = vmlaq_f32(A, B, C);
        // Store at offset i of res3. The previous version advanced this
        // pointer from the res2 cursor, so every store after the first one
        // landed past the end of res2 instead of in res3.
        vst1q_f32(res3.data() + i, D);
    }
    end_time = Clock::now();

    // Print the first element of each buffer as a quick sanity check that
    // the three variants agree.
    for (int i = 0; i < 1; ++i) {
        std::cout << "data[" << i << "]:" << input[i] << std::endl;
        std::cout << "data_res1[" << i << "]:" << res1[i] << std::endl;
        std::cout << "data_res2[" << i << "]:" << res2[i] << std::endl;
        std::cout << "data_res3[" << i << "]:" << res3[i] << std::endl;
    }
    std::cout << "cost total time : " << elapsed_us(start_time, end_time) << " microseconds -- neon method" << std::endl;
    return 0;
}
测试结果
cost total time : 47286 microseconds -- common method
cost total time : 26103 microseconds -- neon method
data[0]:0.00199561
data_res1[0]:10.5462
data_res2[0]:10.5462
data_res3[0]:10.5462
cost total time : 19555 microseconds -- neon method
手动将计算改写成乘加形式(a * 3.1 + 10.54)后,耗时比普通 C++ 实现减少了约 59%(47286 → 19555 微秒,约 2.4 倍加速)。