| #include "caffe2/video/video_decoder.h" |
| #include "caffe2/core/logging.h" |
| |
| #include <stdio.h> |
| #include <mutex> |
| |
| extern "C" { |
| #include <libavcodec/avcodec.h> |
| #include <libavformat/avformat.h> |
| #include <libavutil/log.h> |
| #include <libswresample/swresample.h> |
| #include <libswscale/swscale.h> |
| } |
| |
| namespace caffe2 { |
| |
| VideoDecoder::VideoDecoder() { |
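  // Register all codecs, (de)muxers and network protocols with FFmpeg
  // exactly once; the static mutex guards against concurrent construction.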
  static bool gInitialized = false;
  static std::mutex gMutex;
  std::unique_lock<std::mutex> lock(gMutex);
  if (!gInitialized) {
    av_register_all();
    avcodec_register_all();
    avformat_network_init();
    gInitialized = true;
  }
}

void VideoDecoder::decodeLoop(
    const string& videoName,
    VideoIOContext& ioctx,
    const Params& params,
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
  AVPixelFormat pixFormat = params.pixelFormat_;

  AVFormatContext* inputContext = avformat_alloc_context();
  AVStream* videoStream_ = nullptr;
  AVCodecContext* videoCodecContext_ = nullptr;
  AVFrame* videoStreamFrame_ = nullptr;
  AVPacket packet;
  av_init_packet(&packet); // init packet
  SwsContext* scaleContext_ = nullptr;
  try {
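    // Route all reads through the caller-provided IO context instead of
    // letting libavformat open the input itself.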
    inputContext->pb = ioctx.get_avio();
    inputContext->flags |= AVFMT_FLAG_CUSTOM_IO;
    int ret = 0;

    // Determining the input format:
    int probeSz = 32 * 1024 + AVPROBE_PADDING_SIZE;
    DecodedFrame::AvDataPtr probe((uint8_t*)av_malloc(probeSz));

    memset(probe.get(), 0, probeSz);
    int len = ioctx.read(probe.get(), probeSz - AVPROBE_PADDING_SIZE);
    if (len < probeSz - AVPROBE_PADDING_SIZE) {
      LOG(ERROR) << "Insufficient data to determine video format";
    }

    // seek back to start of stream
    ioctx.seek(0, SEEK_SET);

    unique_ptr<AVProbeData> probeData(new AVProbeData());
    probeData->buf = probe.get();
    probeData->buf_size = len;
    probeData->filename = "";
    // Determine the input format:
    inputContext->iformat = av_probe_input_format(probeData.get(), 1);

    ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to open stream " << ffmpegErrorStr(ret);
    }

    ret = avformat_find_stream_info(inputContext, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to find stream info in " << videoName << " "
                 << ffmpegErrorStr(ret);
    }

    // Decode the stream specified in params; fall back to the first
    // video stream if no stream index was given.
    int videoStreamIndex_ = params.streamIndex_;
    if (videoStreamIndex_ == -1) {
      for (int i = 0; i < (int)inputContext->nb_streams; i++) {
        auto stream = inputContext->streams[i];
        if (stream->codec->codec_type == AVMEDIA_TYPE_VIDEO) {
          videoStreamIndex_ = i;
          videoStream_ = stream;
          break;
        }
      }
    } else if (videoStreamIndex_ < (int)inputContext->nb_streams) {
      // A stream index was given; use that stream directly.
      videoStream_ = inputContext->streams[videoStreamIndex_];
    }

    if (videoStream_ == nullptr) {
      LOG(ERROR) << "Unable to find video stream in " << videoName << " "
                 << ffmpegErrorStr(ret);
    }

    // Initialize codec
    videoCodecContext_ = videoStream_->codec;

    ret = avcodec_open2(
        videoCodecContext_,
        avcodec_find_decoder(videoCodecContext_->codec_id),
        nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Cannot open video codec : "
                 << videoCodecContext_->codec->name;
    }

    // Calculate whether we need to rescale the frames
    int outWidth = videoCodecContext_->width;
    int outHeight = videoCodecContext_->height;

    if (params.maxOutputDimension_ != -1) {
      if (videoCodecContext_->width > videoCodecContext_->height) {
        // dominant width
        if (params.maxOutputDimension_ < videoCodecContext_->width) {
          float ratio =
              (float)params.maxOutputDimension_ / videoCodecContext_->width;
          outWidth = params.maxOutputDimension_;
          outHeight = (int)round(videoCodecContext_->height * ratio);
        }
      } else {
        // dominant height
        if (params.maxOutputDimension_ < videoCodecContext_->height) {
          float ratio =
              (float)params.maxOutputDimension_ / videoCodecContext_->height;
          outWidth = (int)round(videoCodecContext_->width * ratio);
          outHeight = params.maxOutputDimension_;
        }
      }
    } else {
      outWidth = params.outputWidth_ == -1 ? videoCodecContext_->width
                                           : params.outputWidth_;
      outHeight = params.outputHeight_ == -1 ? videoCodecContext_->height
                                             : params.outputHeight_;
    }

    // Make sure that we have a valid format
    CAFFE_ENFORCE_NE(videoCodecContext_->pix_fmt, AV_PIX_FMT_NONE);

    // Create a scale context
    scaleContext_ = sws_getContext(
        videoCodecContext_->width,
        videoCodecContext_->height,
        videoCodecContext_->pix_fmt,
        outWidth,
        outHeight,
        pixFormat,
        SWS_FAST_BILINEAR,
        nullptr,
        nullptr,
        nullptr);

    // Getting video meta data
    VideoMeta videoMeta;
    videoMeta.codec_type = videoCodecContext_->codec_type;
    videoMeta.width = outWidth;
    videoMeta.height = outHeight;
    videoMeta.pixFormat = pixFormat;
    videoMeta.fps = av_q2d(videoStream_->avg_frame_rate);

    // Clear any previously sampled frames
    sampledFrames.clear();

    if (params.intervals_.size() == 0) {
      LOG(ERROR) << "Empty sampling intervals.";
    }

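    // Walk the sampling intervals in order; each interval supplies a start
    // timestamp and the fps at which frames are sampled from then on.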
    std::vector<SampleInterval>::const_iterator itvlIter =
        params.intervals_.begin();
    if (itvlIter->timestamp != 0) {
      LOG(ERROR) << "Sampling interval starting timestamp is not zero.";
    }

    double currFps = itvlIter->fps;
    if (currFps < 0 && currFps != SpecialFps::SAMPLE_ALL_FRAMES &&
        currFps != SpecialFps::SAMPLE_TIMESTAMP_ONLY) {
      // fps must be 0, -1, -2 or > 0
      LOG(ERROR) << "Invalid sampling fps.";
    }

    double prevTimestamp = itvlIter->timestamp;
    itvlIter++;
    if (itvlIter != params.intervals_.end() &&
        prevTimestamp >= itvlIter->timestamp) {
      LOG(ERROR) << "Sampling interval timestamps must be strictly ascending.";
    }

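    // Timestamp (in seconds) of the last sampled frame; -1 means no frame
    // has been sampled yet in the current interval.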
    double lastFrameTimestamp = -1.0;
    double timestamp = -1.0;

    // Initialize frame and packet.
    // These will be reused across calls.
    videoStreamFrame_ = av_frame_alloc();

    // frame index in video stream
    int frameIndex = -1;
    // frame index among the frames written to the output
    int outputFrameIndex = -1;

    int gotPicture = 0;
    int eof = 0;

    // There is a delay between reading packets from the
    // transport and getting decoded frames back.
    // Therefore, after EOF, continue going while
    // the decoder is still giving us frames.
    while (!eof || gotPicture) {
      try {
        if (!eof) {
          ret = av_read_frame(inputContext, &packet);

          if (ret == AVERROR(EAGAIN)) {
            av_free_packet(&packet);
            continue;
          }
          // Interpret any other error as EOF
          if (ret < 0) {
            eof = 1;
            av_free_packet(&packet);
            continue;
          }

          // Ignore packets from other streams
          if (packet.stream_index != videoStreamIndex_) {
            av_free_packet(&packet);
            continue;
          }
        }

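        // Decode the packet into a frame; after EOF the (now empty) packet
        // flushes any frames still buffered inside the decoder.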
        ret = avcodec_decode_video2(
            videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
        if (ret < 0) {
          LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
        }

        try {
          // Nothing to do without a picture
          if (!gotPicture) {
            av_free_packet(&packet);
            continue;
          }

          frameIndex++;

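          // Convert the frame's best-effort PTS from stream time_base units
          // into seconds.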
          timestamp = av_frame_get_best_effort_timestamp(videoStreamFrame_) *
              av_q2d(videoStream_->time_base);

          // if reaching the next interval, update the current fps
          // and reset lastFrameTimestamp so the current frame could be sampled
          // (unless fps == SpecialFps::SAMPLE_NO_FRAME)
          if (itvlIter != params.intervals_.end() &&
              timestamp >= itvlIter->timestamp) {
            lastFrameTimestamp = -1.0;
            currFps = itvlIter->fps;
            prevTimestamp = itvlIter->timestamp;
            itvlIter++;
            if (itvlIter != params.intervals_.end() &&
                prevTimestamp >= itvlIter->timestamp) {
              LOG(ERROR)
                  << "Sampling interval timestamps must be strictly ascending.";
            }
          }

          // keyFrame will bypass all checks on fps sampling settings
          bool keyFrame = params.keyFrames_ && videoStreamFrame_->key_frame;
          if (!keyFrame) {
            // if fps == SpecialFps::SAMPLE_NO_FRAME (0), don't sample at all
            if (currFps == SpecialFps::SAMPLE_NO_FRAME) {
              av_free_packet(&packet);
              continue;
            }

            // fps is considered reached in the following cases:
            // 1. lastFrameTimestamp < 0 - start of a new interval
            //    (or first frame)
            // 2. currFps == SpecialFps::SAMPLE_ALL_FRAMES (-1) - sample every
            //    frame
            // 3. timestamp - lastFrameTimestamp has reached target fps and
            //    currFps > 0 (not special fps setting)
            // different modes for fps:
            // SpecialFps::SAMPLE_NO_FRAME (0):
            //   disable fps sampling, no frame sampled at all
            // SpecialFps::SAMPLE_ALL_FRAMES (-1):
            //   unlimited fps sampling, will sample at native video fps
            // SpecialFps::SAMPLE_TIMESTAMP_ONLY (-2):
            //   disable fps sampling, but will get the frame at specific
            //   timestamp
            // others (> 0): decoding at the specified fps
            bool fpsReached = lastFrameTimestamp < 0 ||
                currFps == SpecialFps::SAMPLE_ALL_FRAMES ||
                (currFps > 0 &&
                 timestamp >= lastFrameTimestamp + (1 / currFps));

            if (!fpsReached) {
              av_free_packet(&packet);
              continue;
            }
          }

          lastFrameTimestamp = timestamp;

          outputFrameIndex++;
          if (params.maximumOutputFrames_ != -1 &&
              outputFrameIndex >= params.maximumOutputFrames_) {
            // enough frames
            av_free_packet(&packet);
            break;
          }

          AVFrame* rgbFrame = av_frame_alloc();
          if (!rgbFrame) {
            LOG(ERROR) << "Error allocating AVframe";
          }

          try {
            // Determine required buffer size and allocate buffer
            int numBytes = avpicture_get_size(pixFormat, outWidth, outHeight);
            DecodedFrame::AvDataPtr buffer(
                (uint8_t*)av_malloc(numBytes * sizeof(uint8_t)));

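            // Point rgbFrame's data/linesize at the allocated buffer, laid
            // out for the output pixel format and dimensions.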
            int size = avpicture_fill(
                (AVPicture*)rgbFrame,
                buffer.get(),
                pixFormat,
                outWidth,
                outHeight);

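            // Rescale and convert the decoded frame into the requested
            // output size and pixel format.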
            sws_scale(
                scaleContext_,
                videoStreamFrame_->data,
                videoStreamFrame_->linesize,
                0,
                videoCodecContext_->height,
                rgbFrame->data,
                rgbFrame->linesize);

            unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
            frame->width_ = outWidth;
            frame->height_ = outHeight;
            frame->data_ = move(buffer);
            frame->size_ = size;
            frame->index_ = frameIndex;
            frame->outputFrameIndex_ = outputFrameIndex;
            frame->timestamp_ = timestamp;
            frame->keyFrame_ = videoStreamFrame_->key_frame;

            sampledFrames.push_back(move(frame));
            av_frame_free(&rgbFrame);
          } catch (const std::exception&) {
            av_frame_free(&rgbFrame);
          }
          av_frame_unref(videoStreamFrame_);
        } catch (const std::exception&) {
          av_frame_unref(videoStreamFrame_);
        }

        av_free_packet(&packet);
      } catch (const std::exception&) {
        av_free_packet(&packet);
      }
    } // of while loop

    // Free all allocated FFmpeg resources
    sws_freeContext(scaleContext_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    avcodec_close(videoCodecContext_);
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  } catch (const std::exception&) {
    // In case of a decoding error, free all allocated FFmpeg resources
    sws_freeContext(scaleContext_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    avcodec_close(videoCodecContext_);
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  }
}

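// Decode a video held entirely in a memory buffer.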
void VideoDecoder::decodeMemory(
    const char* buffer,
    const int size,
    const Params& params,
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
  VideoIOContext ioctx(buffer, size);
  decodeLoop(string("Memory Buffer"), ioctx, params, sampledFrames);
}

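// Decode a video stored in a file on disk.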
void VideoDecoder::decodeFile(
    const string file,
    const Params& params,
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
  VideoIOContext ioctx(file);
  decodeLoop(file, ioctx, params, sampledFrames);
}

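// Translate an FFmpeg error code into a human-readable message.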
string VideoDecoder::ffmpegErrorStr(int result) {
  std::array<char, 128> buf;
  av_strerror(result, buf.data(), buf.size());
  return string(buf.data());
}

} // namespace caffe2