全志在线开发者论坛

a502849625

在A527下测试了一下neon加速的效果。
计算res=(a+3.4)*3.1。

# include <iostream>
# include <chrono>
# include <random>
# include <arm_neon.h>

int main(int argc, char const *argv[])
{
  float *data_tmp = new float[1080 * 720 * 3];
  std::default_random_engine e;
  std::uniform_real_distribution<float> u(0, 255);
  for(int i = 0; i < 1080 * 720 * 3; ++i) {
    *(data_tmp + i) = u(e);
  }

  float *data = data_tmp;
  float *data_res1 = new float[1080 * 720 * 3];
  float *data_tmp1 = data_res1;

  std::chrono::microseconds start_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  for(int i = 0; i < 1080 * 720 * 3; ++i) {
    *data_res1 = ((*data) + 3.4 ) * 3.1;
    ++data_res1;
    ++data;
  }

  std::chrono::microseconds end_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds  -- common method" << std::endl;

  data = data_tmp;
  float *data_res2 = new float[1080 * 720 * 3];
  float *data_tmp2 = data_res2;

  start_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  float32x4_t A = vdupq_n_f32(3.4);
  float32x4_t B = vdupq_n_f32(3.1);
  for(int i = 0; i < 1080 * 720 * 3 / 4; ++i) {
    float32x4_t C = vld1q_f32(data);
    float32x4_t D = vmulq_f32(vaddq_f32(C, A), B);
    vst1q_f32(data_res2, D);
    data = data + 4;
    data_res2 = data_res2 + 4;
  }
  
  end_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds  -- neon method" << std::endl;

  data = data_tmp;
  float *data_res3 = new float[1080 * 720 * 3];
  float *data_tmp3 = data_res3;

  start_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  A = vdupq_n_f32(10.54);
  B = vdupq_n_f32(3.1);
  for(int i = 0; i < 1080 * 720 * 3 / 4; ++i) {
    float32x4_t C = vld1q_f32(data);
    float32x4_t D = vmlaq_f32(A, B, C);
    vst1q_f32(data_res3, D);
    data = data + 4;
    data_res3 = data_res2 + 4;
  }

  end_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  for(int i = 0; i < 1; ++i) {
    std::cout << "data[" << i << "]:" <<data_tmp[i] << std::endl;
    std::cout << "data_res1[" << i << "]:" <<data_tmp1[i] << std::endl;
    std::cout << "data_res2[" << i << "]:" <<data_tmp2[i] << std::endl;
    std::cout << "data_res3[" << i << "]:" <<data_tmp3[i] << std::endl;
  }

  std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds  -- neon method" << std::endl;

  return 0;
}

测试结果

cost total time : 47286 microseconds  -- common method
cost total time : 26103 microseconds  -- neon method
data[0]:0.00199561
data_res1[0]:10.5462
data_res2[0]:10.5462
data_res3[0]:10.5462
cost total time : 19555 microseconds  -- neon method

手动计算转成乘加后，比普通C++速度提升了59%。

a502849625

在A527下测试了一下neon加速的效果。
计算res=(a+3.4)*3.1。

# include <iostream>
# include <chrono>
# include <random>
# include <arm_neon.h>

int main(int argc, char const *argv[])
{
  float *data_tmp = new float[1080 * 720 * 3];
  std::default_random_engine e;
  std::uniform_real_distribution<float> u(0, 255);
  for(int i = 0; i < 1080 * 720 * 3; ++i) {
    *(data_tmp + i) = u(e);
  }

  float *data = data_tmp;
  float *data_res1 = new float[1080 * 720 * 3];
  float *data_tmp1 = data_res1;

  std::chrono::microseconds start_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  for(int i = 0; i < 1080 * 720 * 3; ++i) {
    *data_res1 = ((*data) + 3.4 ) * 3.1;
    ++data_res1;
    ++data;
  }

  std::chrono::microseconds end_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds  -- common method" << std::endl;

  data = data_tmp;
  float *data_res2 = new float[1080 * 720 * 3];
  float *data_tmp2 = data_res2;

  start_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  float32x4_t A = vdupq_n_f32(3.4);
  float32x4_t B = vdupq_n_f32(3.1);
  for(int i = 0; i < 1080 * 720 * 3 / 4; ++i) {
    float32x4_t C = vld1q_f32(data);
    float32x4_t D = vmulq_f32(vaddq_f32(C, A), B);
    vst1q_f32(data_res2, D);
    data = data + 4;
    data_res2 = data_res2 + 4;
  }
  
  end_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds  -- neon method" << std::endl;

  data = data_tmp;
  float *data_res3 = new float[1080 * 720 * 3];
  float *data_tmp3 = data_res3;

  start_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  A = vdupq_n_f32(10.54);
  B = vdupq_n_f32(3.1);
  for(int i = 0; i < 1080 * 720 * 3 / 4; ++i) {
    float32x4_t C = vld1q_f32(data);
    float32x4_t D = vmlaq_f32(A, B, C);
    vst1q_f32(data_res3, D);
    data = data + 4;
    data_res3 = data_res2 + 4;
  }

  end_time = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::system_clock::now().time_since_epoch()
  );

  for(int i = 0; i < 1; ++i) {
    std::cout << "data[" << i << "]:" <<data_tmp[i] << std::endl;
    std::cout << "data_res1[" << i << "]:" <<data_tmp1[i] << std::endl;
    std::cout << "data_res2[" << i << "]:" <<data_tmp2[i] << std::endl;
    std::cout << "data_res3[" << i << "]:" <<data_tmp3[i] << std::endl;
  }

  std::cout << "cost total time : " << (end_time - start_time).count() << " microseconds  -- neon method" << std::endl;

  return 0;
}

测试结果

cost total time : 47286 microseconds  -- common method
cost total time : 26103 microseconds  -- neon method
data[0]:0.00199561
data_res1[0]:10.5462
data_res2[0]:10.5462
data_res3[0]:10.5462
cost total time : 19555 microseconds  -- neon method

手动计算转成乘加后，比普通C++速度提升了59%。

a502849625

问题已解决。参照
SDK_DIR/buildroot/package/auto下有sdk_demo。
SDK_DIR下执行.buildconfig后，执行./buildroot/package/auto/build.sh可以编译demo。
但是demo上的解码器如指导文档所说，解码出图滞后。所以解单帧h264时，demo不会出图。建议参照demo里的stopAndGetLeftFrame函数做修改，把滞后的图像取出。测试可行。

int stopAndGetLeftFrame()
{
    int nRet;

    int leftFrame = VideoStreamFrameNum(pVideoDec, 0);
    ALOGD("VideoStreamFrameNum:%d", leftFrame);
    if (leftFrame > 0) {
        int waitLoop = 0;
        while (leftFrame > 0) {
            nRet = DecodeVideoStream(pVideoDec,
                                     1 /*eos*/,
                                     0/*key frame only*/,
                                     0/*drop b frame*/,
                                     0/*current time*/);
            ALOGD("DecodeVideoStream:%d", nRet);
            if ((nRet == VDECODE_RESULT_NO_FRAME_BUFFER) ||
                (nRet == VDECODE_RESULT_NO_BITSTREAM)) {
                break;
            }

            usleep(200);
            waitLoop++;
            leftFrame--;
            if (waitLoop > 50) {
                ALOGW("decode eos time out > 10 ms!");
                break;
            }
        }
    } else {
        ALOGD("There is no left fream to decode!");
    }

    int nValidPicNum = ValidPictureNum(pVideoDec, 0);
    ALOGD("ValidPictureNum:%d", nValidPicNum);
    if (nValidPicNum <= 0) {
        nRet = -2;
        ALOGW("nValidPicNum:%d, no pic left.", nValidPicNum);
        return nRet;
    }
    while(nValidPicNum > 0) {
        pPicture = RequestPicture(pVideoDec, 0/*the major stream*/); //VdecH264:step6
        if (pPicture != NULL) {
            AVPacket outPacket;
            outPacket.id = pPicture->nID;
            outPacket.pts = pPicture->nPts;

            outPacket.pAddrPhy0 = (unsigned char*)pPicture->phyYBufAddr;
            outPacket.pAddrVir0 = (unsigned char*)pPicture->pData0;
            outPacket.dataLen0 = pPicture->nWidth * pPicture->nHeight;

            outPacket.pAddrPhy1 = (unsigned char*)pPicture->phyCBufAddr;
            outPacket.pAddrVir1 = (unsigned char*)pPicture->pData1;
            outPacket.dataLen1 = outPacket.dataLen0 / 2;

            if (NULL != mDataCbk) {
                mDataCbk->decoderDataReady(&outPacket);
            }

            ReturnPicture(pVideoDec, pPicture);
            nValidPicNum--;
        } else {
            ALOGW("pPicture == NULL.");
            return -2;
        }
    }
    return nRet;
}

a502849625

使用A527解码一帧h264图片，DecodeVideoStream函数返回VDECODE_RESULT_NO_BITSTREAM。相同的h264图片，用ffmpeg软解码是成功的。有人用这款芯片硬解码成功了吗？求指导
代码如下

/*
 * Cedarx media decoder test demo.
 *
 * Copyright (c) 2020-2023 Leng Xujun <lengxujun2007@126.com>.
 *
 * Cedarx is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This program is distributed "as is" WITHOUT ANY WARRANTY of any
 * kind, whether express or implied; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 */

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include "vdecoder.h"


//#define VIDEO_WIDTH 1280//1920
//#define VIDEO_HEIGHT 720//1080
#define VIDEO_IN_FMT VIDEO_CODEC_FORMAT_H264//VIDEO_CODEC_FORMAT_MPEG2//VIDEO_CODEC_FORMAT_H264
#define VIDEO_OUT_FMT PIXEL_FORMAT_YUV_PLANER_420//PIXEL_FORMAT_YUV_PLANER_422

//#define DECODE_IN_FILE "720p.jpg" //"1080p.jpg"
//#define DECODE_OUT_FILE "1080p.yuv"

#define TimeDiff(t2, t1) \
		((long long)( ((long long)((t2).tv_sec - (t1).tv_sec)) * 1000000000LL + (t2).tv_nsec - (t1).tv_nsec ))


static char *ReadFile(char *path, int *pLen);
static int WriteFile(const char *fileName, const char *data, int size);

typedef unsigned char u8;
typedef unsigned int u32;
static u8 *YUV_MB32_420_To_YUV420(u32 Width, u32 Height, u8 *ySrc, u8 *cSrc,  
								  u8 **pY, u8 **pU, u8 **pV);


int main(int argc, char *argv[])
{
	char picFileName[256 + 1], *p;
	int picWidth = 1280, picHeight = 720;
	int useNeon = 1, i;

	AddVDPlugin();

	memset(picFileName, 0, sizeof(picFileName));
	for (i = 1; i < argc; i++) {
		if (strncmp(argv[i], "--size", 6) == 0) {
			picWidth = atoi(argv[i] + 7);

			p = strchr(argv[i], 'x');
			if (p != NULL)
				picHeight = atoi(p + 1);

			if (picWidth < 0 || picHeight < 0) {
				picWidth = 1280;
				picHeight = 720;
			}
		} else if (strncmp(argv[i], "--neon", 6) == 0) {
			p = strchr(argv[i], '=');
			if (p != NULL)
				useNeon = atoi(p + 1);
			if (useNeon < 0)
				useNeon = 1;
		} else if (strncmp(argv[i], "--", 2) == 0) {
			printf("unknow option %s", argv[i]);
		} else {
			strcpy(picFileName, argv[i]);
		}
	}

	if (!picFileName[0]) {
		printf("file name missed");
		printf("./VDecodeTest [--size=wxh] [--neon={0,1}] file-name");
		return -1;
	}

	printf("convert %s(%dx%d) to yuv %s NEON...", 
		picFileName, picWidth, picHeight, useNeon ? "with" : "without");

	/*
	 * Create video decoder.
	 */
	VideoDecoder *pVideoDecoder = NULL;

	pVideoDecoder = CreateVideoDecoder();
	if (NULL == pVideoDecoder) {
		printf("create video decode failed\n");
		return -1;
	}

	/*
	 * Init video decoder depends on video stream information & config.
	 */
	VideoStreamInfo videoStreamInfo;
	VConfig videoConfig;
	
	memset(&videoStreamInfo, 0, sizeof(videoStreamInfo));
	videoStreamInfo.eCodecFormat = VIDEO_IN_FMT;
	videoStreamInfo.nWidth = picWidth;
	videoStreamInfo.nHeight = picHeight;
	
	memset(&videoConfig, 0, sizeof(videoConfig));
	printf("VIDEO_OUT_FMT:%d\n",VIDEO_OUT_FMT);
	videoConfig.eOutputPixelFormat = VIDEO_OUT_FMT;
	// videoConfig.nFrameBufferNum = 10;
	// videoConfig.nDecodeSmoothFrameBufferNum = 3;
	// videoConfig.nDeInterlaceHoldingFrameBufferNum = 3;
	// videoConfig.nDisplayHoldingFrameBufferNum = 3;

	if (InitializeVideoDecoder(pVideoDecoder, 
						&videoStreamInfo, &videoConfig) < 0) {
		printf("init video decoder failed");
		DestroyVideoDecoder(pVideoDecoder);
		return -1;
	}

	struct timespec t1, t2;

	clock_gettime(CLOCK_MONOTONIC, &t1);

	/*
	 * Request video stream buffer from decoder.
	 */
	int nRequireSize = picWidth * picHeight * 3 / 2;
	char *pBuf;
	int bufSize;
	char *pRingBuf;
	int ringBufSize;
	
	if (RequestVideoStreamBuffer(pVideoDecoder, nRequireSize, 
						&pBuf, &bufSize, &pRingBuf, &ringBufSize, 0) < 0) {
		printf("request video stream buffer failed");
		DestroyVideoDecoder(pVideoDecoder);
		return -1;
	}

	/*
	 * Sumbit video stream data to be decode to decoder.
	 */
	char *inFileData = NULL;
	int inDataSize = 0;

	inFileData = ReadFile(picFileName, &inDataSize);
	if (NULL == inFileData || inDataSize <= 0) {
		printf("read file %s failed", picFileName);
		DestroyVideoDecoder(pVideoDecoder);
		return -1;
	}

	printf("read file %d\n", inDataSize);

	if (bufSize >= inDataSize) {
        memcpy(pBuf, inFileData, inDataSize);
    } else {
        memcpy(pBuf, inFileData, bufSize);
        //memcpy(pRingBuf, inFileData + bufSize, inDataSize - bufSize);
    }
	
	VideoStreamDataInfo videoStreamDataInfo;
	
	memset(&videoStreamDataInfo, 0, sizeof(videoStreamDataInfo));
	videoStreamDataInfo.pData = pBuf;
	videoStreamDataInfo.nLength = inDataSize;
	videoStreamDataInfo.bIsFirstPart = 1;
	videoStreamDataInfo.bIsLastPart = 1;
	if (SubmitVideoStreamData(pVideoDecoder, &videoStreamDataInfo, 0) < 0) {
		printf("submit video stream to decoder failed");
		free(inFileData);
		DestroyVideoDecoder(pVideoDecoder);
		return -1;
	}

	/*
	 * Start decoding.
	 */
	int frame_num = VideoStreamFrameNum(pVideoDecoder, 0);
	printf("frame: %d\n", frame_num);
	frame_num = ValidPictureNum(pVideoDecoder, 0);
	printf("frame: %d\n", frame_num);
	int decodeResult = DecodeVideoStream(pVideoDecoder, 0, 0, 0, 0);
	if (ValidPictureNum(pVideoDecoder, 0) < 0) {
	}
	if (!(decodeResult == VDECODE_RESULT_OK || 
		  decodeResult == VDECODE_RESULT_FRAME_DECODED || 
		  decodeResult == VDECODE_RESULT_KEYFRAME_DECODED)) {
		printf("decode failed, decode result: %d\n", decodeResult);
		free(inFileData);
		DestroyVideoDecoder(pVideoDecoder);
		return -1;
	}
	
	/*
	 * Toggles the decoded stream object of decoder to obtain data. 
	 */
	VideoPicture *pVideoPic = NULL;
	pVideoPic = RequestPicture(pVideoDecoder, 0);
	if (NULL == pVideoPic) {
		printf("decode failed");
		free(inFileData);
		DestroyVideoDecoder(pVideoDecoder);
		return -1;
	}

	printf("Decoded data statistics:");
	printf("pixel format: %d", pVideoPic->ePixelFormat);
	printf("width: %d", pVideoPic->nWidth);
	printf("height: %d", pVideoPic->nHeight);
	printf("line stride: %d", pVideoPic->nLineStride);
	printf("left offset: %d", pVideoPic->nLeftOffset);
	printf("top offset: %d", pVideoPic->nTopOffset);
	printf("right offset: %d", pVideoPic->nRightOffset);
	printf("bottom offset: %d", pVideoPic->nBottomOffset);
	printf("progressive: %s", pVideoPic->bIsProgressive ? "y" : "n");

	/*
	 * Returns the toggled decoded stream object back to decoder. 
	 */
	ReturnPicture(pVideoDecoder, pVideoPic);

	clock_gettime(CLOCK_MONOTONIC, &t2);

	char outputFile[256 + 1];

	strcpy(outputFile, picFileName);
	strcat(outputFile, ".yuv");
	// if (useNeon) {
	// 	extern void ConvertMb32420ToNv21Y(char* pSrc,char* pDst,int nWidth, int nHeight);
	
	// 	char *Y = malloc(pVideoPic->nRightOffset * pVideoPic->nBottomOffset);
	// 	ConvertMb32420ToNv21Y(pVideoPic->pData0, Y, pVideoPic->nRightOffset, pVideoPic->nBottomOffset);
	// 	WriteFile(outputFile, Y, pVideoPic->nRightOffset * pVideoPic->nBottomOffset);
	// 	free(Y);
	// } else {
		// u8 *Y, *U, *V;
		// u8 *y_uv = YUV_MB32_420_To_YUV420(pVideoPic->nRightOffset, pVideoPic->nBottomOffset, 
		// 					   			  pVideoPic->pData0, pVideoPic->pData1, &Y, &U, &V);
		WriteFile(outputFile, pVideoPic->pData0, pVideoPic->nRightOffset * pVideoPic->nBottomOffset);
		// free(y_uv);
	// }

	//clock_gettime(CLOCK_MONOTONIC, &t2);
	long long diff = TimeDiff(t2, t1);
	printf("time spent for decoding: %lldms / %lldus/ %lldns", 
		 diff / 1000000, diff / 1000, diff);

	/*
	 * Clean up & Destroy video decoder.
	 */
	free(inFileData);
	inFileData = NULL;
	
	//DestroyVideoDecoder(pVideoDecoder);

	return 0;
}


static char *ReadFile(char *path, int *pLen)
{
    FILE *fp = NULL;
    int ret = 0;
    char *data = NULL;

    fp = fopen(path, "rb");
    if (fp == NULL) {
        printf("read jpeg file error, errno(%d)", errno);
        return NULL;
    }

    fseek(fp,0,SEEK_END);
    *pLen = ftell(fp);
	
    rewind(fp);
    data = (char *) malloc(sizeof(char) * (*pLen));
    if(data == NULL) {
		printf("malloc memory fail");
		fclose(fp);
		return NULL;
	}

    ret = fread (data, 1, *pLen, fp);
    if (ret != *pLen) {
        printf("read file fail");
        fclose(fp);
        free(data);
        return NULL;
    }

    if (fp != NULL)
        fclose(fp);
	
    return data;
}

static int WriteFile(const char *fileName, const char *data, int size)
{
	FILE *fp;

	fp = fopen(fileName, "wb");
	if (fp == NULL) {
		printf("create file %s failed", fileName);
		return -1;
	}

	fwrite(data, 1, size, fp);

	fclose(fp);

	return 0;
}


static void WriteBack_Y(u32 Width, u32 Height, u8 *src, u8 *dst)
{
	u32 i, j;
	u32 x, y;
	u8 *srcT, *dstT;

	srcT = src;
	dstT = dst;

	for (y = 0; y < Height; y += 32)
	{
		dstT = dst + Width * y;
		srcT = src + Width * y;
		for (x = 0; x < Width; x += 32)
		{
			for (i = 0; i < 32; i++)
			{
				for (j = 0; j < 32; j++)
					dstT[i*Width + j] = srcT[i*32 + j];
			}
			dstT += 32;
			srcT += 32 * 32;
		}
	}
}

static void WriteBack_UV(u32 width, u32 height, u8 *srcChrom, u8 *dstU, u8 *dstV)
{
	u32 i, j;
	u32 x, y;
	u8 *C, *U, *V;

	C = srcChrom;
	
	for (y = 0; y < height / 2; y += 32)
	{
		U = dstU + width / 2 * y;
		V = dstV + width / 2 * y;
		C = srcChrom + width * y;

		for (x = 0; x < width; x += 32)
		{
			for (i = 0; i < 32; i++)
			{
				for (j = 0; j < 16; j++)
				{
					if ((y + i) < (height / 2))
					{
						U[i*width / 2 + j] = C[i*32 + 2*j];
						V[i*width / 2 + j] = C[i*32 + 2*j + 1];
					}
				}
			}
			
			U += 16;
			V += 16;
			C += 32*32;
		}
	}
}

static u8 *YUV_MB32_420_To_YUV420(u32 Width, u32 Height, u8 *ySrc, u8 *cSrc,  
								  u8 **pY, u8 **pU, u8 **pV)
{
	u8 *yuv_plane;
	u32 ysize;
	u32 csize;

	u32 width16;
	u32 height16;
	
	u32 width32, height32;

	u32 height64;

	u8 *Y, *U, *V;

	width16 = (Width + 15) & ~15;
	height16 = (Height + 15) & ~15;
	width32 = (Width + 31) & ~31;
	height32 = (Height + 31) & ~31;
	height64 = (Height + 63) & ~63;

	ysize = width32 * height32;
	csize = width32 * height64 / 2;

	yuv_plane = (u8 *)malloc(ysize + csize);
	if (yuv_plane == NULL)
		return NULL;

	Y = yuv_plane;
	WriteBack_Y(width32, height32, ySrc, Y);

	U = Y + width32 * height32;
	V = U + (width32 * height64 / 4);
	//WriteBack_UV(width32, height64, cSrc, U, V);

	*pY = Y;
	*pU = U;
	*pV = V;

	return yuv_plane;
}

a502849625LV 2

@a502849625

a502849625 发布的最佳帖子

a502849625 发布的最新帖子