| #ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_ |
| #define CAFFE2_VIDEO_VIDEO_DECODER_H_ |
| |
#include <stdio.h>

#include <cstring> // memcpy in VideoIOContext::readMemory
#include <memory>
#include <string>
#include <vector>

#include <caffe2/core/logging.h>

extern "C" {
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
}
| |
| namespace caffe2 { |
| |
// Size in bytes of the scratch buffer handed to avio_alloc_context().
#define VIO_BUFFER_SZ 32768
// Upper bound on the number of frames decoded from a single video.
// NOTE(review): both could be `constexpr int` instead of macros — confirm no
// other translation unit relies on the preprocessor definitions first.
#define MAX_DECODING_FRAMES 10000
| |
| // enum to specify 3 special fps sampling behaviors: |
| // 0: disable fps sampling, no frame sampled at all |
| // -1: unlimited fps sampling, will sample at native video fps |
| // -2: disable fps sampling, but will get the frame at specific timestamp |
enum SpecialFps {
  SAMPLE_NO_FRAME = 0, // fps sampling disabled; no frame sampled at all
  SAMPLE_ALL_FRAMES = -1, // sample every frame, i.e. at the native video fps
  SAMPLE_TIMESTAMP_ONLY = -2, // only grab the frame at a specific timestamp
};
| |
| // three different types of resolution when decoding the video |
| // 0: resize to width x height and ignore the aspect ratio; |
| // 1: resize to make size at least (width x height) and keep the aspect ratio; |
| // 2: using the original resolution of the video; if resolution |
| // is smaller than crop_height x crop_width, resize to ensure |
| // new height >= crop_height and new width >= crop_width |
| // and keep the aspect ratio; |
enum VideoResType {
  USE_WIDTH_HEIGHT = 0, // resize to width x height, ignoring aspect ratio
  USE_MINIMAL_WIDTH_HEIGHT = 1, // resize to at least (width x height), keep ratio
  ORIGINAL_RES = 2, // keep native resolution (upsized if below crop size)
};
| |
// three different types of decoding behavior are supported
// 0: do temporal jittering to sample a random clip from the video
// 1: uniformly sample multiple clips from the video
// 2: sample a clip from a given starting frame
enum DecodeType {
  DO_TMP_JITTER = 0, // temporal jitter: sample one random clip
  DO_UNIFORM_SMP = 1, // uniformly sample multiple clips from the video
  USE_START_FRM = 2, // sample one clip from a caller-given starting frame
};
| |
| // sampling interval for fps starting at specified timestamp |
| // use enum SpecialFps to set special fps decoding behavior |
| // note sampled fps will not always accurately follow the target fps, |
| // because sampled frame has to snap to actual frame timestamp, |
| // e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25 |
| // video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2, |
| // because of floating-point division accuracy (1 / 5.0 is not exactly 0.2) |
| struct SampleInterval { |
| double timestamp; |
| double fps; |
| SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {} |
| SampleInterval(double ts, double f) : timestamp(ts), fps(f) {} |
| bool operator<(const SampleInterval& itvl) const { |
| return (timestamp < itvl.timestamp); |
| } |
| }; |
| |
| class Params { |
| public: |
| // return all key-frames regardless of specified fps |
| bool keyFrames_ = false; |
| |
| // Output image pixel format |
| AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24; |
| |
| // Index of stream to decode. |
| // -1 will automatically decode the first video stream. |
| int streamIndex_ = -1; |
| |
| // How many frames to output at most from the video |
| // -1 no limit |
| int maximumOutputFrames_ = -1; |
| |
| // params for video resolution |
| int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT; |
| |
| // the size of the patch croped from the input video |
| int crop_height_ = -1; |
| int crop_width_ = -1; |
| |
| // minimal resolution for resizing when using USE_MINIMAL_WIDTH_HEIGHT |
| int height_min_ = -1; |
| int width_min_ = -1; |
| |
| // the video resolution after resizing |
| int scale_w_ = -1; |
| int scale_h_ = -1; |
| |
| // params for decoding behavior |
| int decode_type_ = DecodeType::DO_TMP_JITTER; |
| int num_of_required_frame_ = -1; |
| |
| // intervals_ control variable sampling fps between different timestamps |
| // intervals_ must be ordered strictly ascending by timestamps |
| // the first interval must have a timestamp of zero |
| // fps must be either the 3 special fps defined in SpecialFps, or > 0 |
| std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}}; |
| |
| Params() {} |
| |
| /** |
| * FPS of output frames |
| * setting here will reset intervals_ and force decoding at target FPS |
| * This can be used if user just want to decode at a steady fps |
| */ |
| Params& fps(float v) { |
| intervals_.clear(); |
| intervals_.emplace_back(0, v); |
| return *this; |
| } |
| |
| /** |
| * Sample output frames at a specified list of timestamps |
| * Timestamps must be in increasing order, and timestamps past the end of the |
| * video will be ignored |
| * Setting here will reset intervals_ |
| */ |
| Params& setSampleTimestamps(const std::vector<double>& timestamps) { |
| intervals_.clear(); |
| // insert an interval per desired frame. |
| for (auto& timestamp : timestamps) { |
| intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY); |
| } |
| return *this; |
| } |
| |
| /** |
| * Pixel format of output buffer, default PIX_FMT_RGB24 |
| */ |
| Params& pixelFormat(AVPixelFormat pixelFormat) { |
| pixelFormat_ = pixelFormat; |
| return *this; |
| } |
| |
| /** |
| * Return all key-frames |
| */ |
| Params& keyFrames(bool keyFrames) { |
| keyFrames_ = keyFrames; |
| return *this; |
| } |
| |
| /** |
| * Index of video stream to process, defaults to the first video stream |
| */ |
| Params& streamIndex(int index) { |
| streamIndex_ = index; |
| return *this; |
| } |
| |
| /** |
| * Only output this many frames, default to no limit |
| */ |
| Params& maxOutputFrames(int count) { |
| maximumOutputFrames_ = count; |
| return *this; |
| } |
| |
| /** |
| * Output frame width, default to video width |
| */ |
| Params& outputWidth(int width) { |
| scale_w_ = width; |
| return *this; |
| } |
| |
| /** |
| * Output frame height, default to video height |
| */ |
| Params& outputHeight(int height) { |
| scale_h_ = height; |
| return *this; |
| } |
| }; |
| |
| // data structure for storing decoded video frames |
class DecodedFrame {
 public:
  // Deleter for buffers allocated with FFmpeg's av_malloc, so that
  // unique_ptr releases them via av_free rather than operator delete.
  struct avDeleter {
    void operator()(unsigned char* p) const {
      av_free(p);
    }
  };
  // Owning pointer to an av_malloc'd byte buffer.
  using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;

  // decoded data buffer
  AvDataPtr data_;

  // size in bytes
  int size_ = 0;

  // frame dimensions
  int width_ = 0;
  int height_ = 0;

  // timestamp in seconds since beginning of video
  double timestamp_ = 0;

  // true if this is a key frame.
  bool keyFrame_ = false;

  // index of frame in video
  int index_ = -1;

  // Sequential number of outputted frame
  int outputFrameIndex_ = -1;
};
| |
| class VideoIOContext { |
| public: |
| explicit VideoIOContext(const std::string& fname) |
| : workBuffersize_(VIO_BUFFER_SZ), |
| workBuffer_((uint8_t*)av_malloc(workBuffersize_)), |
| inputFile_(nullptr), |
| inputBuffer_(nullptr), |
| inputBufferSize_(0) { |
| inputFile_ = fopen(fname.c_str(), "rb"); |
| if (inputFile_ == nullptr) { |
| LOG(ERROR) << "Error opening video file " << fname; |
| } |
| ctx_ = avio_alloc_context( |
| static_cast<unsigned char*>(workBuffer_.get()), |
| workBuffersize_, |
| 0, |
| this, |
| &VideoIOContext::readFile, |
| nullptr, // no write function |
| &VideoIOContext::seekFile); |
| } |
| |
| explicit VideoIOContext(const char* buffer, int size) |
| : workBuffersize_(VIO_BUFFER_SZ), |
| workBuffer_((uint8_t*)av_malloc(workBuffersize_)), |
| inputFile_(nullptr), |
| inputBuffer_(buffer), |
| inputBufferSize_(size) { |
| ctx_ = avio_alloc_context( |
| static_cast<unsigned char*>(workBuffer_.get()), |
| workBuffersize_, |
| 0, |
| this, |
| &VideoIOContext::readMemory, |
| nullptr, // no write function |
| &VideoIOContext::seekMemory); |
| } |
| |
| ~VideoIOContext() { |
| av_free(ctx_); |
| if (inputFile_) { |
| fclose(inputFile_); |
| } |
| } |
| |
| int read(unsigned char* buf, int buf_size) { |
| if (inputBuffer_) { |
| return readMemory(this, buf, buf_size); |
| } else if (inputFile_) { |
| return readFile(this, buf, buf_size); |
| } else { |
| return -1; |
| } |
| } |
| |
| int64_t seek(int64_t offset, int whence) { |
| if (inputBuffer_) { |
| return seekMemory(this, offset, whence); |
| } else if (inputFile_) { |
| return seekFile(this, offset, whence); |
| } else { |
| return -1; |
| } |
| } |
| |
| static int readFile(void* opaque, unsigned char* buf, int buf_size) { |
| VideoIOContext* h = static_cast<VideoIOContext*>(opaque); |
| if (feof(h->inputFile_)) { |
| return AVERROR_EOF; |
| } |
| size_t ret = fread(buf, 1, buf_size, h->inputFile_); |
| if (ret < buf_size) { |
| if (ferror(h->inputFile_)) { |
| return -1; |
| } |
| } |
| return ret; |
| } |
| |
| static int64_t seekFile(void* opaque, int64_t offset, int whence) { |
| VideoIOContext* h = static_cast<VideoIOContext*>(opaque); |
| switch (whence) { |
| case SEEK_CUR: // from current position |
| case SEEK_END: // from eof |
| case SEEK_SET: // from beginning of file |
| return fseek(h->inputFile_, static_cast<long>(offset), whence); |
| break; |
| case AVSEEK_SIZE: |
| int64_t cur = ftell(h->inputFile_); |
| fseek(h->inputFile_, 0L, SEEK_END); |
| int64_t size = ftell(h->inputFile_); |
| fseek(h->inputFile_, cur, SEEK_SET); |
| return size; |
| } |
| |
| return -1; |
| } |
| |
| static int readMemory(void* opaque, unsigned char* buf, int buf_size) { |
| VideoIOContext* h = static_cast<VideoIOContext*>(opaque); |
| if (buf_size < 0) { |
| return -1; |
| } |
| |
| int reminder = h->inputBufferSize_ - h->offset_; |
| int r = buf_size < reminder ? buf_size : reminder; |
| if (r < 0) { |
| return AVERROR_EOF; |
| } |
| |
| memcpy(buf, h->inputBuffer_ + h->offset_, r); |
| h->offset_ += r; |
| return r; |
| } |
| |
| static int64_t seekMemory(void* opaque, int64_t offset, int whence) { |
| VideoIOContext* h = static_cast<VideoIOContext*>(opaque); |
| switch (whence) { |
| case SEEK_CUR: // from current position |
| h->offset_ += offset; |
| break; |
| case SEEK_END: // from eof |
| h->offset_ = h->inputBufferSize_ + offset; |
| break; |
| case SEEK_SET: // from beginning of file |
| h->offset_ = offset; |
| break; |
| case AVSEEK_SIZE: |
| return h->inputBufferSize_; |
| } |
| return h->offset_; |
| } |
| |
| AVIOContext* get_avio() { |
| return ctx_; |
| } |
| |
| private: |
| int workBuffersize_; |
| DecodedFrame::AvDataPtr workBuffer_; |
| // for file mode |
| FILE* inputFile_; |
| |
| // for memory mode |
| const char* inputBuffer_; |
| int inputBufferSize_; |
| int offset_ = 0; |
| |
| AVIOContext* ctx_; |
| }; |
| |
| struct VideoMeta { |
| double fps; |
| int width; |
| int height; |
| enum AVMediaType codec_type; |
| AVPixelFormat pixFormat; |
| VideoMeta() |
| : fps(-1), |
| width(-1), |
| height(-1), |
| codec_type(AVMEDIA_TYPE_VIDEO), |
| pixFormat(AVPixelFormat::AV_PIX_FMT_RGB24) {} |
| }; |
| |
class VideoDecoder {
 public:
  VideoDecoder();

  // Decodes the video stored in file `filename` according to `params`,
  // appending the decoded frames to `sampledFrames`.
  // `start_frm` presumably selects the first frame when USE_START_FRM
  // is active — confirm against the implementation in the .cc file.
  void decodeFile(
      const std::string& filename,
      const Params& params,
      const int start_frm,
      std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);

  // Same as decodeFile, but reads the encoded video from an in-memory
  // buffer of `size` bytes instead of from disk.
  void decodeMemory(
      const char* buffer,
      const int size,
      const Params& params,
      const int start_frm,
      std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);

 private:
  // Formats an ffmpeg/libav error code `result` as a readable string.
  std::string ffmpegErrorStr(int result);

  // Computes output dimensions of at least heightMin x widthMin that
  // preserve the origHeight:origWidth aspect ratio; results are
  // written to outHeight/outWidth.
  void ResizeAndKeepAspectRatio(
      const int origHeight,
      const int origWidth,
      const int heightMin,
      const int widthMin,
      int& outHeight,
      int& outWidth);

  // Shared decode loop driving `ioctx`; used by both decodeFile and
  // decodeMemory (`videoName` is used for logging/diagnostics).
  void decodeLoop(
      const std::string& videoName,
      VideoIOContext& ioctx,
      const Params& params,
      const int start_frm,
      std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
};
| } // namespace caffe2 |
| |
| #endif // CAFFE2_VIDEO_VIDEO_DECODER_H_ |