A527 neon加速测试
-
在A527下测试了一下neon加速的效果。
计算res=(a+3.4)*3.1。# include <iostream> # include <chrono> # include <random> # include <arm_neon.h> int main(int argc, char const *argv[]) { float *data_tmp = new float[1080 * 720 * 3]; std::default_random_engine e; std::uniform_real_distribution<float> u(0, 255); for(int i = 0; i < 1080 * 720 * 3; ++i) { *(data_tmp + i) = u(e); } float *data = data_tmp; float *data_res1 = new float[1080 * 720 * 3]; float *data_tmp1 = data_res1; std::chrono::microseconds start_time = std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::system_clock::now().time_since_epoch() ); for(int i = 0; i < 1080 * 720 * 3; ++i) { *data_res1 = ((*data) + 3.4 ) * 3.1; ++data_res1; ++data; } std::chrono::microseconds end_time = std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::system_clock::now().time_since_epoch() ); std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds -- common method" << std::endl; data = data_tmp; float *data_res2 = new float[1080 * 720 * 3]; float *data_tmp2 = data_res2; start_time = std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::system_clock::now().time_since_epoch() ); float32x4_t A = vdupq_n_f32(3.4); float32x4_t B = vdupq_n_f32(3.1); for(int i = 0; i < 1080 * 720 * 3 / 4; ++i) { float32x4_t C = vld1q_f32(data); float32x4_t D = vmulq_f32(vaddq_f32(C, A), B); vst1q_f32(data_res2, D); data = data + 4; data_res2 = data_res2 + 4; } end_time = std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::system_clock::now().time_since_epoch() ); std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds -- neon method" << std::endl; data = data_tmp; float *data_res3 = new float[1080 * 720 * 3]; float *data_tmp3 = data_res3; start_time = std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::system_clock::now().time_since_epoch() ); A = vdupq_n_f32(10.54); B = vdupq_n_f32(3.1); for(int i = 0; i < 1080 * 720 * 3 / 4; ++i) { float32x4_t C = 
vld1q_f32(data); float32x4_t D = vmlaq_f32(A, B, C); vst1q_f32(data_res3, D); data = data + 4; data_res3 = data_res2 + 4; } end_time = std::chrono::duration_cast<std::chrono::microseconds>( std::chrono::system_clock::now().time_since_epoch() ); for(int i = 0; i < 1; ++i) { std::cout << "data[" << i << "]:" <<data_tmp[i] << std::endl; std::cout << "data_res1[" << i << "]:" <<data_tmp1[i] << std::endl; std::cout << "data_res2[" << i << "]:" <<data_tmp2[i] << std::endl; std::cout << "data_res3[" << i << "]:" <<data_tmp3[i] << std::endl; } std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds -- neon method" << std::endl; return 0; }
测试结果
cost total time : 47286 microseconds -- common method
cost total time : 26103 microseconds -- neon method
data[0]:0.00199561
data_res1[0]:10.5462
data_res2[0]:10.5462
data_res3[0]:10.5462
cost total time : 19555 microseconds -- neon method
手动将计算转换成乘加(vmlaq_f32)后,耗时比普通C++实现减少了约59%(47286 → 19555 微秒,约2.4倍加速)。
-
Copyright © 2024 深圳全志在线有限公司 粤ICP备2021084185号 粤公网安备44030502007680号