导航

    全志在线开发者论坛

    • 注册
    • 登录
    • 搜索
    • 版块
    • 话题
    • 在线文档
    • 社区主页
    1. 主页
    2. a502849625
    A
    • 资料
    • 关注 0
    • 粉丝 0
    • 我的积分 154
    • 主题 2
    • 帖子 3
    • 最佳 1
    • 群组 0

    a502849625LV 2

    @a502849625

    154
    积分
    1
    声望
    2
    资料浏览
    3
    帖子
    0
    粉丝
    0
    关注
    注册时间 最后登录

    a502849625 取消关注 关注

    a502849625 发布的最佳帖子

    • A527 neon加速测试

      在A527下测试了一下neon加速的效果。
      计算res=(a+3.4)*3.1。

      # include <iostream>
      # include <chrono>
      # include <random>
      # include <arm_neon.h>
      
      /**
       * Benchmarks res = (a + 3.4) * 3.1 over 1080 * 720 * 3 random floats in three
       * ways and prints the elapsed microseconds of each:
       *   1. a plain scalar loop,
       *   2. NEON add followed by multiply (vaddq_f32 + vmulq_f32),
       *   3. NEON multiply-accumulate (vmlaq_f32) on the expanded form 10.54 + 3.1 * a.
       * The first element of every result buffer is printed so the variants can be
       * compared by eye.
       *
       * @return always 0.
       */
      int main(int argc, char const *argv[])
      {
        (void)argc;
        (void)argv;

        // The element count is divisible by 4, so the 4-lane loops need no scalar tail.
        const int kCount = 1080 * 720 * 3;
        const int kVectors = kCount / 4;

        float *data_tmp = new float[kCount];
        std::default_random_engine e;
        std::uniform_real_distribution<float> u(0, 255);
        for (int i = 0; i < kCount; ++i) {
          data_tmp[i] = u(e);
        }

        // ---- 1. scalar reference -------------------------------------------------
        float *data_tmp1 = new float[kCount];
        const float *data = data_tmp;
        float *data_res1 = data_tmp1;

        // steady_clock is monotonic, so wall-clock adjustments cannot skew the interval.
        auto start_time = std::chrono::steady_clock::now();

        for (int i = 0; i < kCount; ++i) {
          // float literals keep the arithmetic in single precision, like the NEON paths;
          // double literals would promote every element to double and back.
          *data_res1 = ((*data) + 3.4f) * 3.1f;
          ++data_res1;
          ++data;
        }

        auto end_time = std::chrono::steady_clock::now();

        std::cout << "cost total time : "
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count()
                  << " microseconds  -- common method" << std::endl;

        // ---- 2. NEON add + multiply ----------------------------------------------
        float *data_tmp2 = new float[kCount];
        data = data_tmp;
        float *data_res2 = data_tmp2;

        start_time = std::chrono::steady_clock::now();

        float32x4_t A = vdupq_n_f32(3.4f);
        float32x4_t B = vdupq_n_f32(3.1f);
        for (int i = 0; i < kVectors; ++i) {
          float32x4_t C = vld1q_f32(data);
          float32x4_t D = vmulq_f32(vaddq_f32(C, A), B);
          vst1q_f32(data_res2, D);
          data += 4;
          data_res2 += 4;
        }

        end_time = std::chrono::steady_clock::now();

        std::cout << "cost total time : "
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count()
                  << " microseconds  -- neon method" << std::endl;

        // ---- 3. NEON multiply-accumulate -----------------------------------------
        float *data_tmp3 = new float[kCount];
        data = data_tmp;
        float *data_res3 = data_tmp3;

        start_time = std::chrono::steady_clock::now();

        // (a + 3.4) * 3.1 == 10.54 + 3.1 * a, which is a single multiply-accumulate.
        A = vdupq_n_f32(10.54f);
        B = vdupq_n_f32(3.1f);
        for (int i = 0; i < kVectors; ++i) {
          float32x4_t C = vld1q_f32(data);
          float32x4_t D = vmlaq_f32(A, B, C);
          vst1q_f32(data_res3, D);
          data += 4;
          // Advance the third output pointer itself. It used to be reassigned from
          // data_res2, which already pointed one past the end of the second buffer,
          // so every later store landed outside any allocation.
          data_res3 += 4;
        }

        end_time = std::chrono::steady_clock::now();

        for (int i = 0; i < 1; ++i) {
          std::cout << "data[" << i << "]:" << data_tmp[i] << std::endl;
          std::cout << "data_res1[" << i << "]:" << data_tmp1[i] << std::endl;
          std::cout << "data_res2[" << i << "]:" << data_tmp2[i] << std::endl;
          std::cout << "data_res3[" << i << "]:" << data_tmp3[i] << std::endl;
        }

        std::cout << "cost total time : "
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count()
                  << " microseconds  -- neon method" << std::endl;

        delete[] data_tmp3;
        delete[] data_tmp2;
        delete[] data_tmp1;
        delete[] data_tmp;

        return 0;
      }
      
      

      测试结果

      cost total time : 47286 microseconds  -- common method
      cost total time : 26103 microseconds  -- neon method
      data[0]:0.00199561
      data_res1[0]:10.5462
      data_res2[0]:10.5462
      data_res3[0]:10.5462
      cost total time : 19555 microseconds  -- neon method
      

      手动改写为乘加(vmlaq_f32)后,耗时由 47286 微秒降到 19555 微秒,比普通 C++ 实现减少约 59%(约 2.4 倍加速)。

      发布在 A Series
      A
      a502849625

    a502849625 发布的最新帖子

    • A527 neon加速测试

      在A527下测试了一下neon加速的效果。
      计算res=(a+3.4)*3.1。

      # include <iostream>
      # include <chrono>
      # include <random>
      # include <arm_neon.h>
      
      /**
       * Benchmarks res = (a + 3.4) * 3.1 over 1080 * 720 * 3 random floats in three
       * ways and prints the elapsed microseconds of each:
       *   1. a plain scalar loop,
       *   2. NEON add followed by multiply (vaddq_f32 + vmulq_f32),
       *   3. NEON multiply-accumulate (vmlaq_f32) on the expanded form 10.54 + 3.1 * a.
       * The first element of every result buffer is printed so the variants can be
       * compared by eye.
       *
       * @return always 0.
       */
      int main(int argc, char const *argv[])
      {
        (void)argc;
        (void)argv;

        // The element count is divisible by 4, so the 4-lane loops need no scalar tail.
        const int kCount = 1080 * 720 * 3;
        const int kVectors = kCount / 4;

        float *data_tmp = new float[kCount];
        std::default_random_engine e;
        std::uniform_real_distribution<float> u(0, 255);
        for (int i = 0; i < kCount; ++i) {
          data_tmp[i] = u(e);
        }

        // ---- 1. scalar reference -------------------------------------------------
        float *data_tmp1 = new float[kCount];
        const float *data = data_tmp;
        float *data_res1 = data_tmp1;

        // steady_clock is monotonic, so wall-clock adjustments cannot skew the interval.
        auto start_time = std::chrono::steady_clock::now();

        for (int i = 0; i < kCount; ++i) {
          // float literals keep the arithmetic in single precision, like the NEON paths;
          // double literals would promote every element to double and back.
          *data_res1 = ((*data) + 3.4f) * 3.1f;
          ++data_res1;
          ++data;
        }

        auto end_time = std::chrono::steady_clock::now();

        std::cout << "cost total time : "
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count()
                  << " microseconds  -- common method" << std::endl;

        // ---- 2. NEON add + multiply ----------------------------------------------
        float *data_tmp2 = new float[kCount];
        data = data_tmp;
        float *data_res2 = data_tmp2;

        start_time = std::chrono::steady_clock::now();

        float32x4_t A = vdupq_n_f32(3.4f);
        float32x4_t B = vdupq_n_f32(3.1f);
        for (int i = 0; i < kVectors; ++i) {
          float32x4_t C = vld1q_f32(data);
          float32x4_t D = vmulq_f32(vaddq_f32(C, A), B);
          vst1q_f32(data_res2, D);
          data += 4;
          data_res2 += 4;
        }

        end_time = std::chrono::steady_clock::now();

        std::cout << "cost total time : "
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count()
                  << " microseconds  -- neon method" << std::endl;

        // ---- 3. NEON multiply-accumulate -----------------------------------------
        float *data_tmp3 = new float[kCount];
        data = data_tmp;
        float *data_res3 = data_tmp3;

        start_time = std::chrono::steady_clock::now();

        // (a + 3.4) * 3.1 == 10.54 + 3.1 * a, which is a single multiply-accumulate.
        A = vdupq_n_f32(10.54f);
        B = vdupq_n_f32(3.1f);
        for (int i = 0; i < kVectors; ++i) {
          float32x4_t C = vld1q_f32(data);
          float32x4_t D = vmlaq_f32(A, B, C);
          vst1q_f32(data_res3, D);
          data += 4;
          // Advance the third output pointer itself. It used to be reassigned from
          // data_res2, which already pointed one past the end of the second buffer,
          // so every later store landed outside any allocation.
          data_res3 += 4;
        }

        end_time = std::chrono::steady_clock::now();

        for (int i = 0; i < 1; ++i) {
          std::cout << "data[" << i << "]:" << data_tmp[i] << std::endl;
          std::cout << "data_res1[" << i << "]:" << data_tmp1[i] << std::endl;
          std::cout << "data_res2[" << i << "]:" << data_tmp2[i] << std::endl;
          std::cout << "data_res3[" << i << "]:" << data_tmp3[i] << std::endl;
        }

        std::cout << "cost total time : "
                  << std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count()
                  << " microseconds  -- neon method" << std::endl;

        delete[] data_tmp3;
        delete[] data_tmp2;
        delete[] data_tmp1;
        delete[] data_tmp;

        return 0;
      }
      
      

      测试结果

      cost total time : 47286 microseconds  -- common method
      cost total time : 26103 microseconds  -- neon method
      data[0]:0.00199561
      data_res1[0]:10.5462
      data_res2[0]:10.5462
      data_res3[0]:10.5462
      cost total time : 19555 microseconds  -- neon method
      

      手动改写为乘加(vmlaq_f32)后,耗时由 47286 微秒降到 19555 微秒,比普通 C++ 实现减少约 59%(约 2.4 倍加速)。

      发布在 A Series
      A
      a502849625
    • 回复: A527硬解码失败

      问题已解决。参照
      SDK_DIR/buildroot/package/auto下有sdk_demo。
      SDK_DIR下执行.buildconfig后,执行./buildroot/package/auto/build.sh可以编译demo。
      但是demo上的解码器如指导文档所说,解码出图滞后。所以解单帧h264时,demo不会出图。建议参照demo里的stopAndGetLeftFrame函数做修改,把滞后的图像取出。测试可行。

      /**
       * Drains the decoder: keeps calling DecodeVideoStream with the end-of-stream
       * flag set while stream frames are pending, then requests every valid picture,
       * passes it to mDataCbk (when set) and returns it to the decoder.
       *
       * @return -2 when no valid picture is available or RequestPicture yields NULL;
       *         otherwise the result of the last DecodeVideoStream call, or 0 when
       *         no stream frame was pending and DecodeVideoStream was never called.
       */
      int stopAndGetLeftFrame()
      {
          // Initialised because the drain loop below may not run at all; the final
          // `return nRet` previously returned an indeterminate value in that case.
          int nRet = 0;

          int leftFrame = VideoStreamFrameNum(pVideoDec, 0);
          ALOGD("VideoStreamFrameNum:%d", leftFrame);
          if (leftFrame > 0) {
              int waitLoop = 0;
              while (leftFrame > 0) {
                  nRet = DecodeVideoStream(pVideoDec,
                                           1 /*eos*/,
                                           0/*key frame only*/,
                                           0/*drop b frame*/,
                                           0/*current time*/);
                  ALOGD("DecodeVideoStream:%d", nRet);
                  // Nothing more can be decoded: no free frame buffer or no bitstream left.
                  if ((nRet == VDECODE_RESULT_NO_FRAME_BUFFER) ||
                      (nRet == VDECODE_RESULT_NO_BITSTREAM)) {
                      break;
                  }

                  usleep(200);
                  waitLoop++;
                  leftFrame--;
                  // 50 iterations of 200 us each is the 10 ms budget named in the log line.
                  if (waitLoop > 50) {
                      ALOGW("decode eos time out > 10 ms!");
                      break;
                  }
              }
          } else {
              ALOGD("There is no left fream to decode!");
          }

          int nValidPicNum = ValidPictureNum(pVideoDec, 0);
          ALOGD("ValidPictureNum:%d", nValidPicNum);
          if (nValidPicNum <= 0) {
              nRet = -2;
              ALOGW("nValidPicNum:%d, no pic left.", nValidPicNum);
              return nRet;
          }
          while(nValidPicNum > 0) {
              pPicture = RequestPicture(pVideoDec, 0/*the major stream*/); //VdecH264:step6
              if (pPicture != NULL) {
                  // Describe the picture's luma and chroma planes for the callback.
                  AVPacket outPacket;
                  outPacket.id = pPicture->nID;
                  outPacket.pts = pPicture->nPts;

                  outPacket.pAddrPhy0 = (unsigned char*)pPicture->phyYBufAddr;
                  outPacket.pAddrVir0 = (unsigned char*)pPicture->pData0;
                  outPacket.dataLen0 = pPicture->nWidth * pPicture->nHeight;

                  outPacket.pAddrPhy1 = (unsigned char*)pPicture->phyCBufAddr;
                  outPacket.pAddrVir1 = (unsigned char*)pPicture->pData1;
                  // Chroma length is half the luma length (4:2:0 layout presumably -- confirm).
                  outPacket.dataLen1 = outPacket.dataLen0 / 2;

                  if (NULL != mDataCbk) {
                      mDataCbk->decoderDataReady(&outPacket);
                  }

                  // Hand the picture back only after the callback has consumed it.
                  ReturnPicture(pVideoDec, pPicture);
                  nValidPicNum--;
              } else {
                  ALOGW("pPicture == NULL.");
                  return -2;
              }
          }
          return nRet;
      }
      
      发布在 A Series
      A
      a502849625
    • A527硬解码失败

      使用A527解码一帧h264图片,DecodeVideoStream函数返回VDECODE_RESULT_NO_BITSTREAM。相同的h264图片,用ffmpeg软解码是成功的。有人用这款芯片硬解码成功了吗?求指导
      代码如下

      /*
       * Cedarx media decoder test demo.
       *
       * Copyright (c) 2020-2023 Leng Xujun <lengxujun2007@126.com>.
       *
       * Cedarx is free software; you can redistribute it and/or
       * modify it under the terms of the GNU Lesser General Public
       * License as published by the Free Software Foundation; either
       * version 2.1 of the License, or (at your option) any later version.
       *
       * This program is distributed "as is" WITHOUT ANY WARRANTY of any
       * kind, whether express or implied; without even the implied warranty
       * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
       * GNU Lesser General Public License for more details.
       */
      
      #include <errno.h>
      #include <stdio.h>
      #include <stdlib.h>
      #include <string.h>
      #include <time.h>
      #include "vdecoder.h"
      
      
      //#define VIDEO_WIDTH 1280//1920
      //#define VIDEO_HEIGHT 720//1080
      #define VIDEO_IN_FMT VIDEO_CODEC_FORMAT_H264//VIDEO_CODEC_FORMAT_MPEG2//VIDEO_CODEC_FORMAT_H264
      #define VIDEO_OUT_FMT PIXEL_FORMAT_YUV_PLANER_420//PIXEL_FORMAT_YUV_PLANER_422
      
      //#define DECODE_IN_FILE "720p.jpg" //"1080p.jpg"
      //#define DECODE_OUT_FILE "1080p.yuv"
      
      /*
       * Difference t2 - t1 in nanoseconds, as a long long.
       * Both arguments are accessed through .tv_sec and .tv_nsec (main passes
       * struct timespec values) and each argument is evaluated twice.
       */
      #define TimeDiff(t2, t1) \
      		((long long)( ((long long)((t2).tv_sec - (t1).tv_sec)) * 1000000000LL + (t2).tv_nsec - (t1).tv_nsec ))
      
      
      static char *ReadFile(char *path, int *pLen);
      static int WriteFile(const char *fileName, const char *data, int size);
      
      typedef unsigned char u8;
      typedef unsigned int u32;
      static u8 *YUV_MB32_420_To_YUV420(u32 Width, u32 Height, u8 *ySrc, u8 *cSrc,  
      								  u8 **pY, u8 **pU, u8 **pV);
      
      
      int main(int argc, char *argv[])
      {
      	char picFileName[256 + 1], *p;
      	int picWidth = 1280, picHeight = 720;
      	int useNeon = 1, i;
      
      	AddVDPlugin();
      
      	memset(picFileName, 0, sizeof(picFileName));
      	for (i = 1; i < argc; i++) {
      		if (strncmp(argv[i], "--size", 6) == 0) {
      			picWidth = atoi(argv[i] + 7);
      
      			p = strchr(argv[i], 'x');
      			if (p != NULL)
      				picHeight = atoi(p + 1);
      
      			if (picWidth < 0 || picHeight < 0) {
      				picWidth = 1280;
      				picHeight = 720;
      			}
      		} else if (strncmp(argv[i], "--neon", 6) == 0) {
      			p = strchr(argv[i], '=');
      			if (p != NULL)
      				useNeon = atoi(p + 1);
      			if (useNeon < 0)
      				useNeon = 1;
      		} else if (strncmp(argv[i], "--", 2) == 0) {
      			printf("unknow option %s", argv[i]);
      		} else {
      			strcpy(picFileName, argv[i]);
      		}
      	}
      
      	if (!picFileName[0]) {
      		printf("file name missed");
      		printf("./VDecodeTest [--size=wxh] [--neon={0,1}] file-name");
      		return -1;
      	}
      
      	printf("convert %s(%dx%d) to yuv %s NEON...", 
      		picFileName, picWidth, picHeight, useNeon ? "with" : "without");
      
      	/*
      	 * Create video decoder.
      	 */
      	VideoDecoder *pVideoDecoder = NULL;
      
      	pVideoDecoder = CreateVideoDecoder();
      	if (NULL == pVideoDecoder) {
      		printf("create video decode failed\n");
      		return -1;
      	}
      
      	/*
      	 * Init video decoder depends on video stream information & config.
      	 */
      	VideoStreamInfo videoStreamInfo;
      	VConfig videoConfig;
      	
      	memset(&videoStreamInfo, 0, sizeof(videoStreamInfo));
      	videoStreamInfo.eCodecFormat = VIDEO_IN_FMT;
      	videoStreamInfo.nWidth = picWidth;
      	videoStreamInfo.nHeight = picHeight;
      	
      	memset(&videoConfig, 0, sizeof(videoConfig));
      	printf("VIDEO_OUT_FMT:%d\n",VIDEO_OUT_FMT);
      	videoConfig.eOutputPixelFormat = VIDEO_OUT_FMT;
      	// videoConfig.nFrameBufferNum = 10;
      	// videoConfig.nDecodeSmoothFrameBufferNum = 3;
      	// videoConfig.nDeInterlaceHoldingFrameBufferNum = 3;
      	// videoConfig.nDisplayHoldingFrameBufferNum = 3;
      
      	if (InitializeVideoDecoder(pVideoDecoder, 
      						&videoStreamInfo, &videoConfig) < 0) {
      		printf("init video decoder failed");
      		DestroyVideoDecoder(pVideoDecoder);
      		return -1;
      	}
      
      	struct timespec t1, t2;
      
      	clock_gettime(CLOCK_MONOTONIC, &t1);
      
      	/*
      	 * Request video stream buffer from decoder.
      	 */
      	int nRequireSize = picWidth * picHeight * 3 / 2;
      	char *pBuf;
      	int bufSize;
      	char *pRingBuf;
      	int ringBufSize;
      	
      	if (RequestVideoStreamBuffer(pVideoDecoder, nRequireSize, 
      						&pBuf, &bufSize, &pRingBuf, &ringBufSize, 0) < 0) {
      		printf("request video stream buffer failed");
      		DestroyVideoDecoder(pVideoDecoder);
      		return -1;
      	}
      
      	/*
      	 * Sumbit video stream data to be decode to decoder.
      	 */
      	char *inFileData = NULL;
      	int inDataSize = 0;
      
      	inFileData = ReadFile(picFileName, &inDataSize);
      	if (NULL == inFileData || inDataSize <= 0) {
      		printf("read file %s failed", picFileName);
      		DestroyVideoDecoder(pVideoDecoder);
      		return -1;
      	}
      
      	printf("read file %d\n", inDataSize);
      
      	if (bufSize >= inDataSize) {
              memcpy(pBuf, inFileData, inDataSize);
          } else {
              memcpy(pBuf, inFileData, bufSize);
              //memcpy(pRingBuf, inFileData + bufSize, inDataSize - bufSize);
          }
      	
      	VideoStreamDataInfo videoStreamDataInfo;
      	
      	memset(&videoStreamDataInfo, 0, sizeof(videoStreamDataInfo));
      	videoStreamDataInfo.pData = pBuf;
      	videoStreamDataInfo.nLength = inDataSize;
      	videoStreamDataInfo.bIsFirstPart = 1;
      	videoStreamDataInfo.bIsLastPart = 1;
      	if (SubmitVideoStreamData(pVideoDecoder, &videoStreamDataInfo, 0) < 0) {
      		printf("submit video stream to decoder failed");
      		free(inFileData);
      		DestroyVideoDecoder(pVideoDecoder);
      		return -1;
      	}
      
      	/*
      	 * Start decoding.
      	 */
      	int frame_num = VideoStreamFrameNum(pVideoDecoder, 0);
      	printf("frame: %d\n", frame_num);
      	frame_num = ValidPictureNum(pVideoDecoder, 0);
      	printf("frame: %d\n", frame_num);
      	int decodeResult = DecodeVideoStream(pVideoDecoder, 0, 0, 0, 0);
      	if (ValidPictureNum(pVideoDecoder, 0) < 0) {
      	}
      	if (!(decodeResult == VDECODE_RESULT_OK || 
      		  decodeResult == VDECODE_RESULT_FRAME_DECODED || 
      		  decodeResult == VDECODE_RESULT_KEYFRAME_DECODED)) {
      		printf("decode failed, decode result: %d\n", decodeResult);
      		free(inFileData);
      		DestroyVideoDecoder(pVideoDecoder);
      		return -1;
      	}
      	
      	/*
      	 * Toggles the decoded stream object of decoder to obtain data. 
      	 */
      	VideoPicture *pVideoPic = NULL;
      	pVideoPic = RequestPicture(pVideoDecoder, 0);
      	if (NULL == pVideoPic) {
      		printf("decode failed");
      		free(inFileData);
      		DestroyVideoDecoder(pVideoDecoder);
      		return -1;
      	}
      
      	printf("Decoded data statistics:");
      	printf("pixel format: %d", pVideoPic->ePixelFormat);
      	printf("width: %d", pVideoPic->nWidth);
      	printf("height: %d", pVideoPic->nHeight);
      	printf("line stride: %d", pVideoPic->nLineStride);
      	printf("left offset: %d", pVideoPic->nLeftOffset);
      	printf("top offset: %d", pVideoPic->nTopOffset);
      	printf("right offset: %d", pVideoPic->nRightOffset);
      	printf("bottom offset: %d", pVideoPic->nBottomOffset);
      	printf("progressive: %s", pVideoPic->bIsProgressive ? "y" : "n");
      
      	/*
      	 * Returns the toggled decoded stream object back to decoder. 
      	 */
      	ReturnPicture(pVideoDecoder, pVideoPic);
      
      	clock_gettime(CLOCK_MONOTONIC, &t2);
      
      	char outputFile[256 + 1];
      
      	strcpy(outputFile, picFileName);
      	strcat(outputFile, ".yuv");
      	// if (useNeon) {
      	// 	extern void ConvertMb32420ToNv21Y(char* pSrc,char* pDst,int nWidth, int nHeight);
      	
      	// 	char *Y = malloc(pVideoPic->nRightOffset * pVideoPic->nBottomOffset);
      	// 	ConvertMb32420ToNv21Y(pVideoPic->pData0, Y, pVideoPic->nRightOffset, pVideoPic->nBottomOffset);
      	// 	WriteFile(outputFile, Y, pVideoPic->nRightOffset * pVideoPic->nBottomOffset);
      	// 	free(Y);
      	// } else {
      		// u8 *Y, *U, *V;
      		// u8 *y_uv = YUV_MB32_420_To_YUV420(pVideoPic->nRightOffset, pVideoPic->nBottomOffset, 
      		// 					   			  pVideoPic->pData0, pVideoPic->pData1, &Y, &U, &V);
      		WriteFile(outputFile, pVideoPic->pData0, pVideoPic->nRightOffset * pVideoPic->nBottomOffset);
      		// free(y_uv);
      	// }
      
      	//clock_gettime(CLOCK_MONOTONIC, &t2);
      	long long diff = TimeDiff(t2, t1);
      	printf("time spent for decoding: %lldms / %lldus/ %lldns", 
      		 diff / 1000000, diff / 1000, diff);
      
      	/*
      	 * Clean up & Destroy video decoder.
      	 */
      	free(inFileData);
      	inFileData = NULL;
      	
      	//DestroyVideoDecoder(pVideoDecoder);
      
      	return 0;
      }
      
      
      /*
       * Reads the entire file at `path` into a freshly malloc'ed buffer.
       *
       * path : file to open in binary mode.
       * pLen : receives the number of bytes read; set to 0 on every failure.
       *
       * Returns the buffer (the caller must free() it) or NULL when the file cannot
       * be opened, its size cannot be determined or does not fit in an int, memory
       * is exhausted, or fewer bytes than expected were read. An empty file yields
       * a valid one-byte allocation with *pLen == 0.
       */
      static char *ReadFile(char *path, int *pLen)
      {
          FILE *fp;
          long fileSize;
          size_t bytesRead;
          char *data;

          *pLen = 0;

          fp = fopen(path, "rb");
          if (fp == NULL) {
              printf("read jpeg file error, errno(%d)", errno);
              return NULL;
          }

          /* Seek to the end to learn the size; both calls can fail (e.g. on a pipe). */
          if (fseek(fp, 0, SEEK_END) != 0) {
              printf("read file fail");
              fclose(fp);
              return NULL;
          }
          fileSize = ftell(fp);
          /* Reject ftell errors (-1) and sizes that the int out-parameter cannot hold. */
          if (fileSize < 0 || (long)(int)fileSize != fileSize) {
              printf("read file fail");
              fclose(fp);
              return NULL;
          }

          rewind(fp);

          /* Never request 0 bytes: malloc(0) may legitimately return NULL. */
          data = (char *) malloc(fileSize > 0 ? (size_t)fileSize : 1);
          if (data == NULL) {
              printf("malloc memory fail");
              fclose(fp);
              return NULL;
          }

          bytesRead = fread(data, 1, (size_t)fileSize, fp);
          fclose(fp);

          if (bytesRead != (size_t)fileSize) {
              printf("read file fail");
              free(data);
              return NULL;
          }

          *pLen = (int)fileSize;
          return data;
      }
      
      /*
       * Writes `size` bytes from `data` to `fileName`, creating or truncating it.
       *
       * Returns 0 when every byte was written and the file closed cleanly,
       * -1 when `size` is negative, the file cannot be created, or the write or
       * close fails (a short write used to be reported as success).
       */
      static int WriteFile(const char *fileName, const char *data, int size)
      {
          FILE *fp;
          size_t written;
          int closeFailed;

          /* A negative size would convert to an enormous size_t in fwrite. */
          if (size < 0) {
              printf("write file %s failed", fileName);
              return -1;
          }

          fp = fopen(fileName, "wb");
          if (fp == NULL) {
              printf("create file %s failed", fileName);
              return -1;
          }

          written = fwrite(data, 1, (size_t)size, fp);
          /* fclose flushes buffered data, so its result matters as well. */
          closeFailed = (fclose(fp) != 0);

          if (closeFailed || written != (size_t)size) {
              printf("write file %s failed", fileName);
              return -1;
          }

          return 0;
      }
      
      
      /*
       * Copies a plane stored as consecutive 32x32-byte tiles (`src`) into a
       * row-major plane of `Width` bytes per row (`dst`).
       *
       * Tiles are consumed left to right within each band of 32 rows. The loops
       * always copy whole tiles, so Width and Height are expected to be multiples
       * of 32.
       */
      static void WriteBack_Y(u32 Width, u32 Height, u8 *src, u8 *dst)
      {
          u32 bandTop, tileLeft;
          u32 row, col;

          for (bandTop = 0; bandTop < Height; bandTop += 32) {
              /* Each band of 32 rows starts Width * bandTop bytes into both buffers. */
              u8 *tile = src + Width * bandTop;
              u8 *out = dst + Width * bandTop;

              for (tileLeft = 0; tileLeft < Width; tileLeft += 32) {
                  for (row = 0; row < 32; row++) {
                      for (col = 0; col < 32; col++) {
                          out[row * Width + col] = tile[row * 32 + col];
                      }
                  }
                  out += 32;        /* next tile sits 32 columns to the right */
                  tile += 32 * 32;  /* tiles are stored back to back in the source */
              }
          }
      }
      
      /*
       * Splits chroma stored as consecutive 32x32-byte tiles of interleaved byte
       * pairs (`srcChrom`) into two separate planes: the even byte of each pair
       * goes to `dstU`, the odd byte to `dstV`.
       *
       * Each tile contributes 16 bytes per row to each output plane. Rows at or
       * beyond height / 2 are skipped.
       */
      static void WriteBack_UV(u32 width, u32 height, u8 *srcChrom, u8 *dstU, u8 *dstV)
      {
          u32 bandTop, tileLeft;
          u32 row, col;

          for (bandTop = 0; bandTop < height / 2; bandTop += 32) {
              u8 *uOut = dstU + width / 2 * bandTop;
              u8 *vOut = dstV + width / 2 * bandTop;
              u8 *tile = srcChrom + width * bandTop;

              for (tileLeft = 0; tileLeft < width; tileLeft += 32) {
                  for (row = 0; row < 32; row++) {
                      /* Rows past the end of the chroma plane are not written. */
                      if ((bandTop + row) >= (height / 2))
                          continue;

                      for (col = 0; col < 16; col++) {
                          uOut[row * width / 2 + col] = tile[row * 32 + 2 * col];
                          vOut[row * width / 2 + col] = tile[row * 32 + 2 * col + 1];
                      }
                  }

                  uOut += 16;
                  vOut += 16;
                  tile += 32 * 32;
              }
          }
      }
      
      /*
       * Allocates one buffer for a planar image and fills its Y plane from the
       * tiled source `ySrc` via WriteBack_Y.
       *
       * The width is rounded up to a multiple of 32; the height is rounded up to
       * 32 for the Y plane and to 64 for sizing the chroma area. *pY, *pU and *pV
       * receive pointers into the returned buffer.
       *
       * NOTE: the chroma conversion call is disabled, so the U and V areas are
       * left as malloc returned them and `cSrc` is not read.
       *
       * Returns the buffer (free() it when done) or NULL if allocation fails, in
       * which case the output pointers are left untouched.
       */
      static u8 *YUV_MB32_420_To_YUV420(u32 Width, u32 Height, u8 *ySrc, u8 *cSrc,
                                        u8 **pY, u8 **pU, u8 **pV)
      {
          u32 alignedWidth = (Width + 31) & ~31;
          u32 lumaHeight = (Height + 31) & ~31;
          u32 chromaHeight = (Height + 63) & ~63;

          u32 lumaBytes = alignedWidth * lumaHeight;
          u32 chromaBytes = alignedWidth * chromaHeight / 2;

          u8 *planes = (u8 *)malloc(lumaBytes + chromaBytes);
          u8 *lumaPlane, *uPlane, *vPlane;

          if (planes == NULL)
              return NULL;

          lumaPlane = planes;
          WriteBack_Y(alignedWidth, lumaHeight, ySrc, lumaPlane);

          /* U follows the Y plane; V follows U, each taking half of the chroma area. */
          uPlane = lumaPlane + alignedWidth * lumaHeight;
          vPlane = uPlane + (alignedWidth * chromaHeight / 4);
          //WriteBack_UV(alignedWidth, chromaHeight, cSrc, uPlane, vPlane);

          *pY = lumaPlane;
          *pU = uPlane;
          *pV = vPlane;

          return planes;
      }
      
      
      发布在 A Series
      A
      a502849625