android/emu/media/src/android/emulation/MediaCudaVideoHelper.cpp - platform/external/qemu - Git at Google

 // Copyright (C) 2019 The Android Open Source Project
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "host-common/MediaCudaVideoHelper.h"
 #include "host-common/MediaCudaDriverHelper.h"
 #include "host-common/MediaCudaUtils.h"
 #include "host-common/YuvConverter.h"
 #include "android/utils/debug.h"

 extern "C" {
 #define INIT_CUDA_GL 1
 #include "android/emulation/dynlink_cudaGL.h"
 #include "host-common/dynlink_cuda.h"
 #include "host-common/dynlink_nvcuvid.h"
 }
 // Set to 1 to get verbose tracing of this file on stderr.
 #define MEDIA_CUDA_DEBUG 0

 // CUDA_DPRINT(fmt, ...): printf-style trace, prefixed with the calling
 // function and line and terminated with a newline. Both variants expand to a
 // single statement (no trailing semicolon inside the macro), so the macro is
 // safe under an unbraced if/else and the caller supplies the ';'.
 #if MEDIA_CUDA_DEBUG
 #define CUDA_DPRINT(fmt, ...)                                                 \
     do {                                                                      \
         fprintf(stderr, "media-cuda-video-helper: %s:%d " fmt "\n", __func__, \
                 __LINE__, ##__VA_ARGS__);                                     \
     } while (0)
 #else
 #define CUDA_DPRINT(fmt, ...) \
     do {                      \
     } while (0)
 #endif

 // NVDEC_API_CALL(call): evaluates |call| exactly once and traces the
 // stringified call together with its CUresult when it is not CUDA_SUCCESS.
 // The failure is only logged; it is not propagated to the caller.
 #define NVDEC_API_CALL(cuvidAPI)                                   \
     do {                                                           \
         CUresult errorCode = cuvidAPI;                             \
         if (errorCode != CUDA_SUCCESS) {                           \
             CUDA_DPRINT("%s failed with error code %d", #cuvidAPI, \
                         (int)errorCode);                           \
         }                                                          \
     } while (0)

 namespace android {
 namespace emulation {

 // Shared by all instances: init() returns false right away once this has been
 // cleared, and init() clears it when loading the CUDA drivers, looking up the
 // device (or its name) or creating the CUDA context fails.
 bool MediaCudaVideoHelper::s_isCudaDecoderGood = true;

 // Short aliases for the nested types used in this file.
 using TextureFrame = MediaTexturePool::TextureFrame;
 using FrameInfo = MediaSnapshotState::FrameInfo;
 using ColorAspects = MediaSnapshotState::ColorAspects;

 // Records the configuration only; no CUDA work happens until init().
 //   oMode: IGNORE_RESULT makes HandlePictureDisplay() drop decoded pictures.
 //   fMode: USE_GPU_TEXTURE selects the texture path when a texture pool is
 //          available; otherwise frames are copied into host memory.
 //   cudaVideoCodecType: codec handed to the video parser in init().
 MediaCudaVideoHelper::MediaCudaVideoHelper(OutputTreatmentMode oMode,
                                            FrameStorageMode fMode,
                                            cudaVideoCodec cudaVideoCodecType)
     : mUseGpuTexture(FrameStorageMode::USE_GPU_TEXTURE == fMode),
       mCudaVideoCodecType(cudaVideoCodecType) {
     const bool discardOutput = (OutputTreatmentMode::IGNORE_RESULT == oMode);
     mIgnoreDecoderOutput = discardOutput;
 }

 // Tears down whatever init() created by delegating to deInit().
 MediaCudaVideoHelper::~MediaCudaVideoHelper() {
     this->deInit();
 }

 void MediaCudaVideoHelper::deInit() {
     CUDA_DPRINT("deInit calling");

     mSavedDecodedFrames.clear();
     if (mCudaContext != nullptr) {
         NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
         if (mCudaParser != nullptr) {
             NVDEC_API_CALL(cuvidDestroyVideoParser(mCudaParser));
             mCudaParser = nullptr;
         }

         if (mCudaDecoder != nullptr) {
             NVDEC_API_CALL(cuvidDestroyDecoder(mCudaDecoder));
             mCudaDecoder = nullptr;
         }
         NVDEC_API_CALL(cuCtxPopCurrent(NULL));
         NVDEC_API_CALL(cuvidCtxLockDestroy(mCtxLock));
     }

     if (mCudaContext != nullptr) {
         CUresult myres = cuCtxDestroy(mCudaContext);
         if (myres != CUDA_SUCCESS) {
             CUDA_DPRINT("Failed to destroy cuda context; error code %d",
                         (int)myres);
         }
         mCudaContext = nullptr;
     }
 }

 bool MediaCudaVideoHelper::init() {
     if (!s_isCudaDecoderGood) {
         CUDA_DPRINT(
                 "Already verified: cuda decoder does not work on this host");
         return false;
     }
     if (!MediaCudaDriverHelper::initCudaDrivers()) {
         CUDA_DPRINT("Failed to initCudaDrivers");
         mIsGood = false;
         mErrorCode = 1;
         s_isCudaDecoderGood = false;
         return false;
     }

     if (mCudaContext != nullptr) {
         deInit();
     }

     // cudat stuff
     const int gpuIndex = 0;
     const int cudaFlags = 0;
     CUdevice cudaDevice = 0;
     CUresult myres = cuDeviceGet(&cudaDevice, gpuIndex);
     if (myres != CUDA_SUCCESS) {
         mIsGood = false;
         mErrorCode = 2;
         s_isCudaDecoderGood = false;
         CUDA_DPRINT("Failed to get cuda device, error code %d", (int)myres);
         return false;
     }

     char buf[1024];
     myres = cuDeviceGetName(buf, sizeof(buf), cudaDevice);
     if (myres != CUDA_SUCCESS) {
         mIsGood = false;
         mErrorCode = 3;
         s_isCudaDecoderGood = false;
         CUDA_DPRINT("Failed to get gpu device name, error code %d", (int)myres);
         return false;
     }

     CUDA_DPRINT("using gpu device %s", buf);

     myres = cuCtxCreate(&mCudaContext, cudaFlags, cudaDevice);
     if (myres != CUDA_SUCCESS) {
         mIsGood = false;
         s_isCudaDecoderGood = false;
         CUDA_DPRINT("Failed to create cuda context, error code %d", (int)myres);
         return false;
     }

     NVDEC_API_CALL(cuvidCtxLockCreate(&mCtxLock, mCudaContext));

     CUVIDPARSERPARAMS videoParserParameters = {};
     // videoParserParameters.CodecType = (mType == MediaCodecType::VP8Codec) ?
     // cudaVideoCodec_VP8 : cudaVideoCodec_VP9;
     videoParserParameters.CodecType = mCudaVideoCodecType;

     videoParserParameters.ulMaxNumDecodeSurfaces = 1;
     videoParserParameters.ulMaxDisplayDelay = 0;
     videoParserParameters.pUserData = this;
     videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
     videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
     videoParserParameters.pfnDisplayPicture = HandlePictureDisplayProc;
     NVDEC_API_CALL(
             cuvidCreateVideoParser(&mCudaParser, &videoParserParameters));

     CUDA_DPRINT("Successfully created cuda context %p", mCudaContext);
     dprint("successfully created cuda video decoder for %s, with gpu texture "
            "mode %s",
            mCudaVideoCodecType == cudaVideoCodec_H264
                    ? "H264"
                    : (mCudaVideoCodecType == cudaVideoCodec_HEVC
                               ? "HEVC"
                               : (mCudaVideoCodecType == cudaVideoCodec_VP8
                                          ? "VP8"
                                          : "VP9")),
            mUseGpuTexture ? "on" : "off");

     return true;
 }

 // Hands one chunk of bitstream to the video parser.
 //   frame / szBytes: the input bytes; a null pointer or a zero size is
 //                    submitted as an end-of-stream packet instead and does
 //                    not count as an input frame.
 //   inputPts: timestamp attached to the packet.
 // Parser errors are only traced (see NVDEC_API_CALL).
 void MediaCudaVideoHelper::decode(const uint8_t* frame,
                                   size_t szBytes,
                                   uint64_t inputPts) {
     CUDA_DPRINT("%s(frame=%p, sz=%zu)", __func__, frame, szBytes);

     const bool hasPayload = (frame != nullptr) && (szBytes != 0);

     CUVIDSOURCEDATAPACKET packet = {0};
     packet.timestamp = inputPts;
     packet.payload_size = szBytes;
     packet.payload = frame;
     packet.flags = CUVID_PKT_TIMESTAMP | CUVID_PKT_ENDOFPICTURE;
     if (hasPayload) {
         ++mNumInputFrame;
     } else {
         packet.flags |= CUVID_PKT_ENDOFSTREAM;
     }
     NVDEC_API_CALL(cuvidParseVideoData(mCudaParser, &packet));
 }

 // Submits an empty end-of-stream packet to the video parser.
 // Parser errors are only traced (see NVDEC_API_CALL).
 void MediaCudaVideoHelper::flush() {
     CUDA_DPRINT("started flushing");

     CUVIDSOURCEDATAPACKET packet = {0};
     packet.flags = packet.flags | CUVID_PKT_ENDOFSTREAM;
     packet.payload_size = 0;
     packet.payload = NULL;
     NVDEC_API_CALL(cuvidParseVideoData(mCudaParser, &packet));

     CUDA_DPRINT("done one flushing");
 }

 // Sequence callback (installed in init() via HandleVideoSequenceProc,
 // presumably a static trampoline declared in the header). Checks that the
 // GPU can decode the announced format, records the frame geometry and color
 // description, and (re)creates the hardware decoder.
 //
 //   pVideoFormat: format description supplied by the parser.
 //
 // Returns the number of decode surfaces (8) on success. Returns 0 when the
 // format is rejected or the decoder cannot be created, which is the value
 // the NVDEC parser API documents as "fail"; mIsGood is cleared in that case.
 // Returning a value > 1 on those paths would tell the parser to carry on and
 // hand pictures to a missing (or stale) decoder.
 int MediaCudaVideoHelper::HandleVideoSequence(CUVIDEOFORMAT* pVideoFormat) {
     const int nDecodeSurface = 8;  // need 8 for 4K video
     constexpr int kSequenceRejected = 0;

     CUVIDDECODECAPS decodecaps = {};
     decodecaps.eCodecType = pVideoFormat->codec;
     decodecaps.eChromaFormat = pVideoFormat->chroma_format;
     decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;

     NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
     NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps));
     NVDEC_API_CALL(cuCtxPopCurrent(NULL));

     if (!decodecaps.bIsSupported) {
         mIsGood = false;
         mErrorCode = 4;
         CUDA_DPRINT("Codec not supported on this GPU.");
         return kSequenceRejected;
     }

     if ((pVideoFormat->coded_width > decodecaps.nMaxWidth) ||
         (pVideoFormat->coded_height > decodecaps.nMaxHeight)) {
         CUDA_DPRINT("Resolution not supported on this GPU");
         mIsGood = false;
         mErrorCode = 5;
         return kSequenceRejected;
     }

     // Macroblock count, using 16x16 blocks.
     if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) >
         decodecaps.nMaxMBCount) {
         CUDA_DPRINT("MBCount not supported on this GPU");
         mIsGood = false;
         mErrorCode = 6;
         return kSequenceRejected;
     }

     mLumaWidth =
             pVideoFormat->display_area.right - pVideoFormat->display_area.left;
     mLumaHeight =
             pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
     mChromaHeight = mLumaHeight * 0.5;  // NV12
     mBPP = pVideoFormat->bit_depth_luma_minus8 > 0 ? 2 : 1;

     // Color description is only taken from the stream for H264 and HEVC.
     if (mCudaVideoCodecType == cudaVideoCodec_H264 ||
         mCudaVideoCodecType == cudaVideoCodec_HEVC) {
         mColorRange =
                 pVideoFormat->video_signal_description.video_full_range_flag
                         ? 2
                         : 0;
         mColorPrimaries =
                 pVideoFormat->video_signal_description.color_primaries;
         mColorTransfer =
                 pVideoFormat->video_signal_description.transfer_characteristics;
         mColorSpace =
                 pVideoFormat->video_signal_description.matrix_coefficients;
     }

     CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
     videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
     videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
     videoDecodeCreateInfo.OutputFormat = cudaVideoSurfaceFormat_NV12;
     CUDA_DPRINT("output format is %d", videoDecodeCreateInfo.OutputFormat);
     videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
     if (pVideoFormat->progressive_sequence) {
         videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
     } else {
         videoDecodeCreateInfo.DeinterlaceMode =
                 cudaVideoDeinterlaceMode_Adaptive;
     }
     videoDecodeCreateInfo.ulNumOutputSurfaces = 1;
     // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by
     // NVDEC hardware
     videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
     videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
     videoDecodeCreateInfo.vidLock = mCtxLock;
     videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
     videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;

     // Output frames use the display area; grow the recorded buffer size when
     // the new geometry needs more room (width * height * 3 / 2 bytes).
     if (mOutputHeight != mLumaHeight || mOutputWidth != mLumaWidth) {
         CUDA_DPRINT("old width %d old height %d", mOutputWidth, mOutputHeight);
         mOutputWidth = mLumaWidth;
         mOutputHeight = mLumaHeight;
         CUDA_DPRINT("new width %d new height %d", mOutputWidth, mOutputHeight);
         unsigned int newOutBufferSize = mOutputWidth * mOutputHeight * 3 / 2;
         if (mOutBufferSize < newOutBufferSize) {
             mOutBufferSize = newOutBufferSize;
         }
     }

     // Decoded surfaces keep the coded size; no scaling is requested.
     videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
     videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;

     mSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
     mSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;

     NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
     if (mCudaDecoder != nullptr) {
         // A decoder from a previous sequence is replaced, not reused.
         NVDEC_API_CALL(cuvidDestroyDecoder(mCudaDecoder));
         mCudaDecoder = nullptr;
     }
 #if MEDIA_CUDA_DEBUG
     {
         size_t free, total;
         cuMemGetInfo(&free, &total);
         CUDA_DPRINT("free memory %g M, total %g M", free / 1048576.0,
                     total / 1048576.0);
     }
 #endif
     const CUresult createResult =
             cuvidCreateDecoder(&mCudaDecoder, &videoDecodeCreateInfo);
     NVDEC_API_CALL(cuCtxPopCurrent(NULL));

     if (createResult != CUDA_SUCCESS || mCudaDecoder == nullptr) {
         CUDA_DPRINT("cuvidCreateDecoder failed with error code %d",
                     (int)createResult);
         mCudaDecoder = nullptr;
         mIsGood = false;
         // NOTE(review): no mErrorCode is assigned for this case; codes 4-6
         // above are the only ones used by this callback.
         return kSequenceRejected;
     }

     CUDA_DPRINT("successfully called. decoder %p", mCudaDecoder);
     return nDecodeSurface;
 }

 // Decode callback (installed in init() via HandlePictureDecodeProc):
 // submits one picture to the hardware decoder.
 //
 //   pPicParams: picture parameters supplied by the parser.
 //
 // Returns 0 when no decoder exists (HandleVideoSequence() did not create
 // one), so the parser is told the picture could not be decoded instead of
 // passing a null handle to the driver. Returns 1 otherwise; an error from
 // cuvidDecodePicture itself is still only traced.
 int MediaCudaVideoHelper::HandlePictureDecode(CUVIDPICPARAMS* pPicParams) {
     if (mCudaDecoder == nullptr) {
         CUDA_DPRINT("no decoder available, picture dropped");
         return 0;
     }
     NVDEC_API_CALL(cuvidDecodePicture(mCudaDecoder, pPicParams));
     CUDA_DPRINT("successfully called.");
     return 1;
 }

 // Display callback (installed in init() via HandlePictureDisplayProc):
 // maps the decoded picture, copies it either into a GPU texture frame or
 // into a host buffer (converted from interleaved to planar UV), and appends
 // the result to mSavedDecodedFrames under mFrameLock.
 //
 //   pDispInfo: picture index, field flags and timestamp from the parser.
 //
 // Returns 1 when the picture was stored (or output is being ignored), 0 when
 // the stream looks corrupted or the picture could not be mapped.
 int MediaCudaVideoHelper::HandlePictureDisplay(CUVIDPARSERDISPINFO* pDispInfo) {
     if (mIgnoreDecoderOutput) {
         return 1;
     }
     constexpr int MAX_NUM_INPUT_WITHOUT_OUTPUT = 16;
     if (mNumOutputFrame == 0 && mNumInputFrame > MAX_NUM_INPUT_WITHOUT_OUTPUT) {
         // after more than 16 inputs, there is still no output,
         // probably corrupted stream, ignore everything from now on
         dwarning("%d frames decoded witout any output, possibly bad "
                "input stream. Ignore output frames (they might be corrupted) "
                "from now on.",
                MAX_NUM_INPUT_WITHOUT_OUTPUT);
         return 0;
     }

     CUVIDPROCPARAMS videoProcessingParameters = {};
     videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame;
     videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1;
     videoProcessingParameters.top_field_first = pDispInfo->top_field_first;
     videoProcessingParameters.unpaired_field =
             pDispInfo->repeat_first_field < 0;
     videoProcessingParameters.output_stream = 0;
     const uint64_t myOutputPts = pDispInfo->timestamp;

     CUdeviceptr dpSrcFrame = 0;
     unsigned int nSrcPitch = 0;
     CUresult errorCode = cuvidMapVideoFrame(mCudaDecoder, pDispInfo->picture_index,
                                       &dpSrcFrame, &nSrcPitch,
                                       &videoProcessingParameters);
     if (errorCode != CUDA_SUCCESS) {
         CUDA_DPRINT("failed to call cuvidMapVideoFrame with error code %d\n", (int)errorCode);
         return 0;
     }

     NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
     const unsigned int newOutBufferSize = mOutputWidth * mOutputHeight * 3 / 2;
     std::vector<uint8_t> myFrame;
     // Value-initialized because the texture ids are stored in the FrameInfo
     // below even on the host-copy path, where no texture frame is fetched.
     TextureFrame texFrame{};
     uint8_t* pDecodedFrame = nullptr;  // stays null on the texture path
     if (mUseGpuTexture && mTexturePool != nullptr) {
         media_cuda_utils_copy_context my_copy_context{
                 .src_frame = dpSrcFrame,
                 .src_pitch = nSrcPitch,
                 .src_surface_height = mSurfaceHeight,
                 .dest_width = mOutputWidth,
                 .dest_height = mOutputHeight,
         };
         texFrame = mTexturePool->getTextureFrame(mOutputWidth, mOutputHeight);
         mTexturePool->saveDecodedFrameToTexture(
                 texFrame, &my_copy_context,
                 (void*)media_cuda_utils_nv12_updater);
     } else {
         myFrame.resize(newOutBufferSize);
         pDecodedFrame = myFrame.data();

         // The decoder is created with cudaVideoSurfaceFormat_NV12 output and
         // the host buffer holds width * height * 3 / 2 bytes, so rows are
         // copied at one byte per sample. Scaling the row size by mBPP (2 for
         // streams deeper than 8 bits) would write past the end of myFrame.
         const size_t rowBytes = mOutputWidth;

         CUDA_MEMCPY2D m = {0};
         m.srcMemoryType = CU_MEMORYTYPE_DEVICE;
         m.srcDevice = dpSrcFrame;
         m.srcPitch = nSrcPitch;
         m.dstMemoryType = CU_MEMORYTYPE_HOST;
         m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame);
         m.dstPitch = rowBytes;
         m.WidthInBytes = rowBytes;
         m.Height = mLumaHeight;
         CUDA_DPRINT("dstDevice %p, dstPitch %d, WidthInBytes %d Height %d",
                     m.dstHost, (int)m.dstPitch, (int)m.WidthInBytes,
                     (int)m.Height);

         // Luma plane.
         NVDEC_API_CALL(cuMemcpy2DAsync(&m, 0));

         // Interleaved chroma plane: it starts mSurfaceHeight rows into the
         // source surface and directly after the luma rows in the host buffer.
         m.srcDevice = (CUdeviceptr)((uint8_t*)dpSrcFrame +
                                     m.srcPitch * mSurfaceHeight);
         m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame +
                                                 m.dstPitch * mLumaHeight);
         m.Height = mChromaHeight;
         NVDEC_API_CALL(cuMemcpy2DAsync(&m, 0));
     }

     // Wait for the asynchronous copies before the CPU touches the buffer.
     NVDEC_API_CALL(cuStreamSynchronize(0));
     if (pDecodedFrame != nullptr) {
         YuvConverter<uint8_t> convert8(mOutputWidth, mOutputHeight);
         convert8.UVInterleavedToPlanar(pDecodedFrame);
     }
     NVDEC_API_CALL(cuCtxPopCurrent(NULL));

     NVDEC_API_CALL(cuvidUnmapVideoFrame(mCudaDecoder, dpSrcFrame));
     {
         std::lock_guard<std::mutex> g(mFrameLock);

         mSavedDecodedFrames.push_back(MediaSnapshotState::FrameInfo{
                 std::move(myFrame),
                 std::vector<uint32_t>{texFrame.Ytex, texFrame.UVtex},
                 (int)mOutputWidth, (int)mOutputHeight, myOutputPts,
                 ColorAspects{mColorPrimaries, mColorRange, mColorTransfer,
                              mColorSpace}});
     }
     ++mNumOutputFrame;
     CUDA_DPRINT("successfully called.");
     return 1;
 }

 }  // namespace emulation
 }  // namespace android
	// Copyright (C) 2019 The Android Open Source Project
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "host-common/MediaCudaVideoHelper.h"
	#include "host-common/MediaCudaDriverHelper.h"
	#include "host-common/MediaCudaUtils.h"
	#include "host-common/YuvConverter.h"
	#include "android/utils/debug.h"

	extern "C" {
	#define INIT_CUDA_GL 1
	#include "android/emulation/dynlink_cudaGL.h"
	#include "host-common/dynlink_cuda.h"
	#include "host-common/dynlink_nvcuvid.h"
	}
	// Set to 1 to get verbose tracing of this file on stderr.
	#define MEDIA_CUDA_DEBUG 0

	// CUDA_DPRINT(fmt, ...): printf-style trace, prefixed with the calling
	// function and line and terminated with a newline. Both variants expand to a
	// single statement (no trailing semicolon inside the macro), so the macro is
	// safe under an unbraced if/else and the caller supplies the ';'.
	#if MEDIA_CUDA_DEBUG
	#define CUDA_DPRINT(fmt, ...)                                                 \
	    do {                                                                      \
	        fprintf(stderr, "media-cuda-video-helper: %s:%d " fmt "\n", __func__, \
	                __LINE__, ##__VA_ARGS__);                                     \
	    } while (0)
	#else
	#define CUDA_DPRINT(fmt, ...) \
	    do {                      \
	    } while (0)
	#endif

	// NVDEC_API_CALL(call): evaluates |call| exactly once and traces the
	// stringified call together with its CUresult when it is not CUDA_SUCCESS.
	// The failure is only logged; it is not propagated to the caller.
	#define NVDEC_API_CALL(cuvidAPI)                                   \
	    do {                                                           \
	        CUresult errorCode = cuvidAPI;                             \
	        if (errorCode != CUDA_SUCCESS) {                           \
	            CUDA_DPRINT("%s failed with error code %d", #cuvidAPI, \
	                        (int)errorCode);                           \
	        }                                                          \
	    } while (0)

	namespace android {
	namespace emulation {

	// Shared by all instances: init() returns false right away once this has been
	// cleared, and init() clears it when loading the CUDA drivers, looking up the
	// device (or its name) or creating the CUDA context fails.
	bool MediaCudaVideoHelper::s_isCudaDecoderGood = true;

	// Short aliases for the nested types used in this file.
	using TextureFrame = MediaTexturePool::TextureFrame;
	using FrameInfo = MediaSnapshotState::FrameInfo;
	using ColorAspects = MediaSnapshotState::ColorAspects;

	// Records the configuration only; no CUDA work happens until init().
	//   oMode: IGNORE_RESULT makes HandlePictureDisplay() drop decoded pictures.
	//   fMode: USE_GPU_TEXTURE selects the texture path when a texture pool is
	//          available; otherwise frames are copied into host memory.
	//   cudaVideoCodecType: codec handed to the video parser in init().
	MediaCudaVideoHelper::MediaCudaVideoHelper(OutputTreatmentMode oMode,
	                                           FrameStorageMode fMode,
	                                           cudaVideoCodec cudaVideoCodecType)
	    : mUseGpuTexture(FrameStorageMode::USE_GPU_TEXTURE == fMode),
	      mCudaVideoCodecType(cudaVideoCodecType) {
	    const bool discardOutput = (OutputTreatmentMode::IGNORE_RESULT == oMode);
	    mIgnoreDecoderOutput = discardOutput;
	}

	// Tears down whatever init() created by delegating to deInit().
	MediaCudaVideoHelper::~MediaCudaVideoHelper() {
	    this->deInit();
	}

	void MediaCudaVideoHelper::deInit() {
	CUDA_DPRINT("deInit calling");

	mSavedDecodedFrames.clear();
	if (mCudaContext != nullptr) {
	NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
	if (mCudaParser != nullptr) {
	NVDEC_API_CALL(cuvidDestroyVideoParser(mCudaParser));
	mCudaParser = nullptr;
	}

	if (mCudaDecoder != nullptr) {
	NVDEC_API_CALL(cuvidDestroyDecoder(mCudaDecoder));
	mCudaDecoder = nullptr;
	}
	NVDEC_API_CALL(cuCtxPopCurrent(NULL));
	NVDEC_API_CALL(cuvidCtxLockDestroy(mCtxLock));
	}

	if (mCudaContext != nullptr) {
	CUresult myres = cuCtxDestroy(mCudaContext);
	if (myres != CUDA_SUCCESS) {
	CUDA_DPRINT("Failed to destroy cuda context; error code %d",
	(int)myres);
	}
	mCudaContext = nullptr;
	}
	}

	bool MediaCudaVideoHelper::init() {
	if (!s_isCudaDecoderGood) {
	CUDA_DPRINT(
	"Already verified: cuda decoder does not work on this host");
	return false;
	}
	if (!MediaCudaDriverHelper::initCudaDrivers()) {
	CUDA_DPRINT("Failed to initCudaDrivers");
	mIsGood = false;
	mErrorCode = 1;
	s_isCudaDecoderGood = false;
	return false;
	}

	if (mCudaContext != nullptr) {
	deInit();
	}

	// cudat stuff
	const int gpuIndex = 0;
	const int cudaFlags = 0;
	CUdevice cudaDevice = 0;
	CUresult myres = cuDeviceGet(&cudaDevice, gpuIndex);
	if (myres != CUDA_SUCCESS) {
	mIsGood = false;
	mErrorCode = 2;
	s_isCudaDecoderGood = false;
	CUDA_DPRINT("Failed to get cuda device, error code %d", (int)myres);
	return false;
	}

	char buf[1024];
	myres = cuDeviceGetName(buf, sizeof(buf), cudaDevice);
	if (myres != CUDA_SUCCESS) {
	mIsGood = false;
	mErrorCode = 3;
	s_isCudaDecoderGood = false;
	CUDA_DPRINT("Failed to get gpu device name, error code %d", (int)myres);
	return false;
	}

	CUDA_DPRINT("using gpu device %s", buf);

	myres = cuCtxCreate(&mCudaContext, cudaFlags, cudaDevice);
	if (myres != CUDA_SUCCESS) {
	mIsGood = false;
	s_isCudaDecoderGood = false;
	CUDA_DPRINT("Failed to create cuda context, error code %d", (int)myres);
	return false;
	}

	NVDEC_API_CALL(cuvidCtxLockCreate(&mCtxLock, mCudaContext));

	CUVIDPARSERPARAMS videoParserParameters = {};
	// videoParserParameters.CodecType = (mType == MediaCodecType::VP8Codec) ?
	// cudaVideoCodec_VP8 : cudaVideoCodec_VP9;
	videoParserParameters.CodecType = mCudaVideoCodecType;

	videoParserParameters.ulMaxNumDecodeSurfaces = 1;
	videoParserParameters.ulMaxDisplayDelay = 0;
	videoParserParameters.pUserData = this;
	videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
	videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
	videoParserParameters.pfnDisplayPicture = HandlePictureDisplayProc;
	NVDEC_API_CALL(
	cuvidCreateVideoParser(&mCudaParser, &videoParserParameters));

	CUDA_DPRINT("Successfully created cuda context %p", mCudaContext);
	dprint("successfully created cuda video decoder for %s, with gpu texture "
	"mode %s",
	mCudaVideoCodecType == cudaVideoCodec_H264
	? "H264"
	: (mCudaVideoCodecType == cudaVideoCodec_HEVC
	? "HEVC"
	: (mCudaVideoCodecType == cudaVideoCodec_VP8
	? "VP8"
	: "VP9")),
	mUseGpuTexture ? "on" : "off");

	return true;
	}

	// Hands one chunk of bitstream to the video parser.
	//   frame / szBytes: the input bytes; a null pointer or a zero size is
	//                    submitted as an end-of-stream packet instead and does
	//                    not count as an input frame.
	//   inputPts: timestamp attached to the packet.
	// Parser errors are only traced (see NVDEC_API_CALL).
	void MediaCudaVideoHelper::decode(const uint8_t* frame,
	                                  size_t szBytes,
	                                  uint64_t inputPts) {
	    CUDA_DPRINT("%s(frame=%p, sz=%zu)", __func__, frame, szBytes);

	    const bool hasPayload = (frame != nullptr) && (szBytes != 0);

	    CUVIDSOURCEDATAPACKET packet = {0};
	    packet.timestamp = inputPts;
	    packet.payload_size = szBytes;
	    packet.payload = frame;
	    // Plain '|' operators: the backslash-escaped pipes in this copy were
	    // extraction damage and did not compile.
	    packet.flags = CUVID_PKT_TIMESTAMP | CUVID_PKT_ENDOFPICTURE;
	    if (hasPayload) {
	        ++mNumInputFrame;
	    } else {
	        packet.flags |= CUVID_PKT_ENDOFSTREAM;
	    }
	    NVDEC_API_CALL(cuvidParseVideoData(mCudaParser, &packet));
	}

	// Submits an empty end-of-stream packet to the video parser.
	// Parser errors are only traced (see NVDEC_API_CALL).
	void MediaCudaVideoHelper::flush() {
	    CUDA_DPRINT("started flushing");

	    CUVIDSOURCEDATAPACKET packet = {0};
	    packet.payload = NULL;
	    packet.payload_size = 0;
	    // Plain '|=' operator: the backslash-escaped pipe in this copy was
	    // extraction damage and did not compile.
	    packet.flags |= CUVID_PKT_ENDOFSTREAM;
	    NVDEC_API_CALL(cuvidParseVideoData(mCudaParser, &packet));

	    CUDA_DPRINT("done one flushing");
	}

	// Sequence callback (installed in init() via HandleVideoSequenceProc,
	// presumably a static trampoline declared in the header). Checks that the
	// GPU can decode the announced format, records the frame geometry and color
	// description, and (re)creates the hardware decoder.
	//
	//   pVideoFormat: format description supplied by the parser.
	//
	// Returns the number of decode surfaces (8) on success. Returns 0 when the
	// format is rejected or the decoder cannot be created, which is the value
	// the NVDEC parser API documents as "fail"; mIsGood is cleared in that case.
	// Returning a value > 1 on those paths would tell the parser to carry on and
	// hand pictures to a missing (or stale) decoder.
	int MediaCudaVideoHelper::HandleVideoSequence(CUVIDEOFORMAT* pVideoFormat) {
	    const int nDecodeSurface = 8;  // need 8 for 4K video
	    constexpr int kSequenceRejected = 0;

	    CUVIDDECODECAPS decodecaps = {};
	    decodecaps.eCodecType = pVideoFormat->codec;
	    decodecaps.eChromaFormat = pVideoFormat->chroma_format;
	    decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;

	    NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
	    NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps));
	    NVDEC_API_CALL(cuCtxPopCurrent(NULL));

	    if (!decodecaps.bIsSupported) {
	        mIsGood = false;
	        mErrorCode = 4;
	        CUDA_DPRINT("Codec not supported on this GPU.");
	        return kSequenceRejected;
	    }

	    // Plain '||' operators throughout: the backslash-escaped pipes in this
	    // copy were extraction damage and did not compile.
	    if ((pVideoFormat->coded_width > decodecaps.nMaxWidth) ||
	        (pVideoFormat->coded_height > decodecaps.nMaxHeight)) {
	        CUDA_DPRINT("Resolution not supported on this GPU");
	        mIsGood = false;
	        mErrorCode = 5;
	        return kSequenceRejected;
	    }

	    // Macroblock count, using 16x16 blocks.
	    if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) >
	        decodecaps.nMaxMBCount) {
	        CUDA_DPRINT("MBCount not supported on this GPU");
	        mIsGood = false;
	        mErrorCode = 6;
	        return kSequenceRejected;
	    }

	    mLumaWidth =
	            pVideoFormat->display_area.right - pVideoFormat->display_area.left;
	    mLumaHeight =
	            pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
	    mChromaHeight = mLumaHeight * 0.5;  // NV12
	    mBPP = pVideoFormat->bit_depth_luma_minus8 > 0 ? 2 : 1;

	    // Color description is only taken from the stream for H264 and HEVC.
	    if (mCudaVideoCodecType == cudaVideoCodec_H264 ||
	        mCudaVideoCodecType == cudaVideoCodec_HEVC) {
	        mColorRange =
	                pVideoFormat->video_signal_description.video_full_range_flag
	                        ? 2
	                        : 0;
	        mColorPrimaries =
	                pVideoFormat->video_signal_description.color_primaries;
	        mColorTransfer =
	                pVideoFormat->video_signal_description.transfer_characteristics;
	        mColorSpace =
	                pVideoFormat->video_signal_description.matrix_coefficients;
	    }

	    CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
	    videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
	    videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
	    videoDecodeCreateInfo.OutputFormat = cudaVideoSurfaceFormat_NV12;
	    CUDA_DPRINT("output format is %d", videoDecodeCreateInfo.OutputFormat);
	    videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
	    if (pVideoFormat->progressive_sequence) {
	        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
	    } else {
	        videoDecodeCreateInfo.DeinterlaceMode =
	                cudaVideoDeinterlaceMode_Adaptive;
	    }
	    videoDecodeCreateInfo.ulNumOutputSurfaces = 1;
	    // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by
	    // NVDEC hardware
	    videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
	    videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
	    videoDecodeCreateInfo.vidLock = mCtxLock;
	    videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
	    videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;

	    // Output frames use the display area; grow the recorded buffer size when
	    // the new geometry needs more room (width * height * 3 / 2 bytes).
	    if (mOutputHeight != mLumaHeight || mOutputWidth != mLumaWidth) {
	        CUDA_DPRINT("old width %d old height %d", mOutputWidth, mOutputHeight);
	        mOutputWidth = mLumaWidth;
	        mOutputHeight = mLumaHeight;
	        CUDA_DPRINT("new width %d new height %d", mOutputWidth, mOutputHeight);
	        unsigned int newOutBufferSize = mOutputWidth * mOutputHeight * 3 / 2;
	        if (mOutBufferSize < newOutBufferSize) {
	            mOutBufferSize = newOutBufferSize;
	        }
	    }

	    // Decoded surfaces keep the coded size; no scaling is requested.
	    videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
	    videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;

	    mSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
	    mSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;

	    NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
	    if (mCudaDecoder != nullptr) {
	        // A decoder from a previous sequence is replaced, not reused.
	        NVDEC_API_CALL(cuvidDestroyDecoder(mCudaDecoder));
	        mCudaDecoder = nullptr;
	    }
	#if MEDIA_CUDA_DEBUG
	    {
	        size_t free, total;
	        cuMemGetInfo(&free, &total);
	        CUDA_DPRINT("free memory %g M, total %g M", free / 1048576.0,
	                    total / 1048576.0);
	    }
	#endif
	    const CUresult createResult =
	            cuvidCreateDecoder(&mCudaDecoder, &videoDecodeCreateInfo);
	    NVDEC_API_CALL(cuCtxPopCurrent(NULL));

	    if (createResult != CUDA_SUCCESS || mCudaDecoder == nullptr) {
	        CUDA_DPRINT("cuvidCreateDecoder failed with error code %d",
	                    (int)createResult);
	        mCudaDecoder = nullptr;
	        mIsGood = false;
	        // NOTE(review): no mErrorCode is assigned for this case; codes 4-6
	        // above are the only ones used by this callback.
	        return kSequenceRejected;
	    }

	    CUDA_DPRINT("successfully called. decoder %p", mCudaDecoder);
	    return nDecodeSurface;
	}

	// Decode callback (installed in init() via HandlePictureDecodeProc):
	// submits one picture to the hardware decoder.
	//
	//   pPicParams: picture parameters supplied by the parser.
	//
	// Returns 0 when no decoder exists (HandleVideoSequence() did not create
	// one), so the parser is told the picture could not be decoded instead of
	// passing a null handle to the driver. Returns 1 otherwise; an error from
	// cuvidDecodePicture itself is still only traced.
	int MediaCudaVideoHelper::HandlePictureDecode(CUVIDPICPARAMS* pPicParams) {
	    if (mCudaDecoder == nullptr) {
	        CUDA_DPRINT("no decoder available, picture dropped");
	        return 0;
	    }
	    NVDEC_API_CALL(cuvidDecodePicture(mCudaDecoder, pPicParams));
	    CUDA_DPRINT("successfully called.");
	    return 1;
	}

	// Display callback (installed in init() via HandlePictureDisplayProc):
	// maps the decoded picture, copies it either into a GPU texture frame or
	// into a host buffer (converted from interleaved to planar UV), and appends
	// the result to mSavedDecodedFrames under mFrameLock.
	//
	//   pDispInfo: picture index, field flags and timestamp from the parser.
	//
	// Returns 1 when the picture was stored (or output is being ignored), 0 when
	// the stream looks corrupted or the picture could not be mapped.
	int MediaCudaVideoHelper::HandlePictureDisplay(CUVIDPARSERDISPINFO* pDispInfo) {
	    if (mIgnoreDecoderOutput) {
	        return 1;
	    }
	    constexpr int MAX_NUM_INPUT_WITHOUT_OUTPUT = 16;
	    if (mNumOutputFrame == 0 && mNumInputFrame > MAX_NUM_INPUT_WITHOUT_OUTPUT) {
	        // after more than 16 inputs, there is still no output,
	        // probably corrupted stream, ignore everything from now on
	        dwarning("%d frames decoded witout any output, possibly bad "
	        "input stream. Ignore output frames (they might be corrupted) "
	        "from now on.",
	        MAX_NUM_INPUT_WITHOUT_OUTPUT);
	        return 0;
	    }

	    CUVIDPROCPARAMS videoProcessingParameters = {};
	    videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame;
	    videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1;
	    videoProcessingParameters.top_field_first = pDispInfo->top_field_first;
	    videoProcessingParameters.unpaired_field =
	            pDispInfo->repeat_first_field < 0;
	    videoProcessingParameters.output_stream = 0;
	    const uint64_t myOutputPts = pDispInfo->timestamp;

	    CUdeviceptr dpSrcFrame = 0;
	    unsigned int nSrcPitch = 0;
	    CUresult errorCode = cuvidMapVideoFrame(mCudaDecoder, pDispInfo->picture_index,
	                                      &dpSrcFrame, &nSrcPitch,
	                                      &videoProcessingParameters);
	    if (errorCode != CUDA_SUCCESS) {
	        CUDA_DPRINT("failed to call cuvidMapVideoFrame with error code %d\n", (int)errorCode);
	        return 0;
	    }

	    NVDEC_API_CALL(cuCtxPushCurrent(mCudaContext));
	    const unsigned int newOutBufferSize = mOutputWidth * mOutputHeight * 3 / 2;
	    std::vector<uint8_t> myFrame;
	    // Value-initialized because the texture ids are stored in the FrameInfo
	    // below even on the host-copy path, where no texture frame is fetched.
	    TextureFrame texFrame{};
	    uint8_t* pDecodedFrame = nullptr;  // stays null on the texture path
	    if (mUseGpuTexture && mTexturePool != nullptr) {
	        media_cuda_utils_copy_context my_copy_context{
	                .src_frame = dpSrcFrame,
	                .src_pitch = nSrcPitch,
	                .src_surface_height = mSurfaceHeight,
	                .dest_width = mOutputWidth,
	                .dest_height = mOutputHeight,
	        };
	        texFrame = mTexturePool->getTextureFrame(mOutputWidth, mOutputHeight);
	        mTexturePool->saveDecodedFrameToTexture(
	                texFrame, &my_copy_context,
	                (void*)media_cuda_utils_nv12_updater);
	    } else {
	        myFrame.resize(newOutBufferSize);
	        pDecodedFrame = myFrame.data();

	        // The decoder is created with cudaVideoSurfaceFormat_NV12 output and
	        // the host buffer holds width * height * 3 / 2 bytes, so rows are
	        // copied at one byte per sample. Scaling the row size by mBPP (2 for
	        // streams deeper than 8 bits) would write past the end of myFrame.
	        const size_t rowBytes = mOutputWidth;

	        CUDA_MEMCPY2D m = {0};
	        m.srcMemoryType = CU_MEMORYTYPE_DEVICE;
	        m.srcDevice = dpSrcFrame;
	        m.srcPitch = nSrcPitch;
	        m.dstMemoryType = CU_MEMORYTYPE_HOST;
	        m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame);
	        m.dstPitch = rowBytes;
	        m.WidthInBytes = rowBytes;
	        m.Height = mLumaHeight;
	        CUDA_DPRINT("dstDevice %p, dstPitch %d, WidthInBytes %d Height %d",
	                    m.dstHost, (int)m.dstPitch, (int)m.WidthInBytes,
	                    (int)m.Height);

	        // Luma plane.
	        NVDEC_API_CALL(cuMemcpy2DAsync(&m, 0));

	        // Interleaved chroma plane: it starts mSurfaceHeight rows into the
	        // source surface and directly after the luma rows in the host buffer.
	        m.srcDevice = (CUdeviceptr)((uint8_t*)dpSrcFrame +
	                                    m.srcPitch * mSurfaceHeight);
	        m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame +
	                                                m.dstPitch * mLumaHeight);
	        m.Height = mChromaHeight;
	        NVDEC_API_CALL(cuMemcpy2DAsync(&m, 0));
	    }

	    // Wait for the asynchronous copies before the CPU touches the buffer.
	    NVDEC_API_CALL(cuStreamSynchronize(0));
	    if (pDecodedFrame != nullptr) {
	        YuvConverter<uint8_t> convert8(mOutputWidth, mOutputHeight);
	        convert8.UVInterleavedToPlanar(pDecodedFrame);
	    }
	    NVDEC_API_CALL(cuCtxPopCurrent(NULL));

	    NVDEC_API_CALL(cuvidUnmapVideoFrame(mCudaDecoder, dpSrcFrame));
	    {
	        std::lock_guard<std::mutex> g(mFrameLock);

	        mSavedDecodedFrames.push_back(MediaSnapshotState::FrameInfo{
	                std::move(myFrame),
	                std::vector<uint32_t>{texFrame.Ytex, texFrame.UVtex},
	                (int)mOutputWidth, (int)mOutputHeight, myOutputPts,
	                ColorAspects{mColorPrimaries, mColorRange, mColorTransfer,
	                             mColorSpace}});
	    }
	    ++mNumOutputFrame;
	    CUDA_DPRINT("successfully called.");
	    return 1;
	}

	} // namespace emulation
	} // namespace android