blob: 6b900d7e0789c7d6bc313b12ed01947f7f9dbf4a [file] [log] [blame]
#ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
#define CAFFE2_VIDEO_VIDEO_DECODER_H_
#include <caffe2/core/logging.h>
#include <stdio.h>
#include <memory>
#include <string>
#include <vector>
extern "C" {
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
}
namespace caffe2 {
#define VIO_BUFFER_SZ 32768
#define MAX_DECODING_FRAMES 10000
// enum to specify 3 special fps sampling behaviors:
// 0: disable fps sampling, no frame sampled at all
// -1: unlimited fps sampling, will sample at native video fps
// -2: disable fps sampling, but will get the frame at specific timestamp
enum SpecialFps {
// disable fps sampling entirely; no frames are returned
SAMPLE_NO_FRAME = 0,
// unlimited sampling: return every frame at the video's native fps
SAMPLE_ALL_FRAMES = -1,
// disable fps sampling, but return the frame at a specific timestamp
SAMPLE_TIMESTAMP_ONLY = -2,
};
// three different types of resolution when decoding the video
// 0: resize to width x height and ignore the aspect ratio;
// 1: resize to make size at least (width x height) and keep the aspect ratio;
// 2: using the original resolution of the video; if resolution
// is smaller than crop_height x crop_width, resize to ensure
// new height >= crop_height and new width >= crop_width
// and keep the aspect ratio;
enum VideoResType {
// resize to exactly width x height, ignoring the aspect ratio
USE_WIDTH_HEIGHT = 0,
// resize so the result is at least (width x height), keeping aspect ratio
USE_MINIMAL_WIDTH_HEIGHT = 1,
// keep the original resolution (resized only if smaller than the crop size)
ORIGINAL_RES = 2,
};
// three different types of decoding behavior are supported
// 0: do temporal jittering to sample a random clip from the video
// 1: uniformly sample multiple clips from the video
// 2: sample a clip from a given starting frame
enum DecodeType {
// sample a random clip via temporal jittering
DO_TMP_JITTER = 0,
// uniformly sample multiple clips across the video
DO_UNIFORM_SMP = 1,
// sample a clip beginning at a caller-supplied start frame
USE_START_FRM = 2,
};
// sampling interval for fps starting at specified timestamp
// use enum SpecialFps to set special fps decoding behavior
// note sampled fps will not always accurately follow the target fps,
// because sampled frame has to snap to actual frame timestamp,
// e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25
// video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2,
// because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
struct SampleInterval {
  // Start of this sampling interval, in seconds.
  double timestamp;
  // Target sampling fps for the interval; either > 0 or one of SpecialFps.
  double fps;

  // Default: catch-all interval that samples every frame.
  SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
  SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}

  // Intervals are ordered by starting timestamp only; fps is ignored.
  bool operator<(const SampleInterval& other) const {
    return timestamp < other.timestamp;
  }
};
class Params {
public:
// return all key-frames regardless of specified fps
bool keyFrames_ = false;
// Output image pixel format
AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
// Index of stream to decode.
// -1 will automatically decode the first video stream.
int streamIndex_ = -1;
// How many frames to output at most from the video
// -1 no limit
int maximumOutputFrames_ = -1;
// params for video resolution
int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
// the size of the patch croped from the input video
int crop_height_ = -1;
int crop_width_ = -1;
// minimal resolution for resizing when using USE_MINIMAL_WIDTH_HEIGHT
int height_min_ = -1;
int width_min_ = -1;
// the video resolution after resizing
int scale_w_ = -1;
int scale_h_ = -1;
// params for decoding behavior
int decode_type_ = DecodeType::DO_TMP_JITTER;
int num_of_required_frame_ = -1;
// intervals_ control variable sampling fps between different timestamps
// intervals_ must be ordered strictly ascending by timestamps
// the first interval must have a timestamp of zero
// fps must be either the 3 special fps defined in SpecialFps, or > 0
std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
Params() {}
/**
* FPS of output frames
* setting here will reset intervals_ and force decoding at target FPS
* This can be used if user just want to decode at a steady fps
*/
Params& fps(float v) {
intervals_.clear();
intervals_.emplace_back(0, v);
return *this;
}
/**
* Sample output frames at a specified list of timestamps
* Timestamps must be in increasing order, and timestamps past the end of the
* video will be ignored
* Setting here will reset intervals_
*/
Params& setSampleTimestamps(const std::vector<double>& timestamps) {
intervals_.clear();
// insert an interval per desired frame.
for (auto& timestamp : timestamps) {
intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY);
}
return *this;
}
/**
* Pixel format of output buffer, default PIX_FMT_RGB24
*/
Params& pixelFormat(AVPixelFormat pixelFormat) {
pixelFormat_ = pixelFormat;
return *this;
}
/**
* Return all key-frames
*/
Params& keyFrames(bool keyFrames) {
keyFrames_ = keyFrames;
return *this;
}
/**
* Index of video stream to process, defaults to the first video stream
*/
Params& streamIndex(int index) {
streamIndex_ = index;
return *this;
}
/**
* Only output this many frames, default to no limit
*/
Params& maxOutputFrames(int count) {
maximumOutputFrames_ = count;
return *this;
}
/**
* Output frame width, default to video width
*/
Params& outputWidth(int width) {
scale_w_ = width;
return *this;
}
/**
* Output frame height, default to video height
*/
Params& outputHeight(int height) {
scale_h_ = height;
return *this;
}
};
// data structure for storing decoded video frames
class DecodedFrame {
public:
// Deleter matching FFmpeg's allocator, so AvDataPtr releases frame
// buffers with av_free (they are allocated with av_malloc).
struct avDeleter {
void operator()(unsigned char* p) const {
av_free(p);
}
};
// Owning pointer to an av_malloc'ed pixel buffer.
using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
// decoded data buffer
AvDataPtr data_;
// size in bytes
int size_ = 0;
// frame dimensions
int width_ = 0;
int height_ = 0;
// timestamp in seconds since beginning of video
double timestamp_ = 0;
// true if this is a key frame.
bool keyFrame_ = false;
// index of frame in video
int index_ = -1;
// Sequential number of outputted frame
int outputFrameIndex_ = -1;
};
class VideoIOContext {
 public:
  // I/O context reading encoded video from a file on disk.
  // On fopen failure this only logs (preserving the original contract of
  // not throwing); the callbacks below guard against the null FILE*
  // instead of invoking UB via feof(nullptr)/fread(nullptr).
  explicit VideoIOContext(const std::string& fname)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(nullptr),
        inputBufferSize_(0) {
    inputFile_ = fopen(fname.c_str(), "rb");
    if (inputFile_ == nullptr) {
      LOG(ERROR) << "Error opening video file " << fname;
    }
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0, // write_flag: read-only context
        this,
        &VideoIOContext::readFile,
        nullptr, // no write function
        &VideoIOContext::seekFile);
  }

  // I/O context reading encoded video from an in-memory buffer.
  // `buffer` is NOT owned and must outlive this object.
  explicit VideoIOContext(const char* buffer, int size)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(buffer),
        inputBufferSize_(size) {
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0, // write_flag: read-only context
        this,
        &VideoIOContext::readMemory,
        nullptr, // no write function
        &VideoIOContext::seekMemory);
  }

  // Non-copyable: a copy would double-fclose inputFile_ and double-free
  // ctx_ / the work buffer. (This also suppresses the implicit moves.)
  VideoIOContext(const VideoIOContext&) = delete;
  VideoIOContext& operator=(const VideoIOContext&) = delete;

  ~VideoIOContext() {
    // NOTE(review): libav may internally re-allocate ctx_->buffer, in
    // which case workBuffer_ frees a stale pointer on destruction; newer
    // FFmpeg recommends avio_context_free() plus av_freep(&ctx_->buffer).
    // Kept as in the original — confirm against the FFmpeg version in use.
    av_free(ctx_);
    if (inputFile_) {
      fclose(inputFile_);
    }
  }

  // Reads up to buf_size bytes from whichever source this context wraps.
  // Returns bytes read, AVERROR_EOF at end of input, or -1 on error /
  // no source.
  int read(unsigned char* buf, int buf_size) {
    if (inputBuffer_) {
      return readMemory(this, buf, buf_size);
    } else if (inputFile_) {
      return readFile(this, buf, buf_size);
    } else {
      return -1;
    }
  }

  // Seeks within whichever source this context wraps; whence may also be
  // AVSEEK_SIZE to query the total stream size. Returns -1 on error.
  int64_t seek(int64_t offset, int whence) {
    if (inputBuffer_) {
      return seekMemory(this, offset, whence);
    } else if (inputFile_) {
      return seekFile(this, offset, whence);
    } else {
      return -1;
    }
  }

  // AVIO read callback for file mode.
  static int readFile(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    // Guard against a failed fopen in the constructor.
    if (h->inputFile_ == nullptr) {
      return AVERROR_EOF;
    }
    if (feof(h->inputFile_)) {
      return AVERROR_EOF;
    }
    size_t ret = fread(buf, 1, buf_size, h->inputFile_);
    // fread returns size_t; compare unsigned-to-unsigned (buf_size is
    // never negative here) to avoid a signed/unsigned mismatch.
    if (ret < static_cast<size_t>(buf_size)) {
      if (ferror(h->inputFile_)) {
        return -1;
      }
      // Zero bytes and no error means end of file; report it explicitly,
      // since modern FFmpeg no longer accepts a 0 return from read.
      if (ret == 0) {
        return AVERROR_EOF;
      }
    }
    return static_cast<int>(ret);
  }

  // AVIO seek callback for file mode.
  static int64_t seekFile(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    // Guard against a failed fopen in the constructor.
    if (h->inputFile_ == nullptr) {
      return -1;
    }
    switch (whence) {
      case SEEK_CUR: // from current position
      case SEEK_END: // from eof
      case SEEK_SET: // from beginning of file
        return fseek(h->inputFile_, static_cast<long>(offset), whence);
      case AVSEEK_SIZE: {
        // Report the total stream size: remember the current position,
        // seek to the end to measure, then restore.
        int64_t cur = ftell(h->inputFile_);
        fseek(h->inputFile_, 0L, SEEK_END);
        int64_t size = ftell(h->inputFile_);
        fseek(h->inputFile_, cur, SEEK_SET);
        return size;
      }
    }
    return -1;
  }

  // AVIO read callback for memory mode.
  static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    if (buf_size < 0) {
      return -1;
    }
    // Bytes still available past the current read offset; may be
    // negative if a seek moved past the end of the buffer.
    int remainder = h->inputBufferSize_ - h->offset_;
    if (remainder <= 0) {
      return AVERROR_EOF;
    }
    int r = buf_size < remainder ? buf_size : remainder;
    memcpy(buf, h->inputBuffer_ + h->offset_, r);
    h->offset_ += r;
    return r;
  }

  // AVIO seek callback for memory mode. Seeking past the end is allowed;
  // the next readMemory will then report AVERROR_EOF.
  static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    switch (whence) {
      case SEEK_CUR: // from current position
        h->offset_ += offset;
        break;
      case SEEK_END: // from eof
        h->offset_ = h->inputBufferSize_ + offset;
        break;
      case SEEK_SET: // from beginning of file
        h->offset_ = offset;
        break;
      case AVSEEK_SIZE:
        return h->inputBufferSize_;
    }
    return h->offset_;
  }

  // Non-owning access to the underlying AVIOContext.
  AVIOContext* get_avio() {
    return ctx_;
  }

 private:
  // Size of the scratch buffer handed to avio_alloc_context.
  int workBuffersize_;
  // Scratch buffer (av_malloc'ed; see the destructor NOTE about ownership).
  DecodedFrame::AvDataPtr workBuffer_;
  // for file mode; null when in memory mode or if fopen failed
  FILE* inputFile_;
  // for memory mode; non-owning
  const char* inputBuffer_;
  int inputBufferSize_;
  // current read position within inputBuffer_
  int offset_ = 0;
  AVIOContext* ctx_;
};
struct VideoMeta {
  // Native frame rate of the stream; -1 until probed.
  double fps = -1;
  // Frame dimensions in pixels; -1 until probed.
  int width = -1;
  int height = -1;
  // Media type of the selected stream.
  enum AVMediaType codec_type = AVMEDIA_TYPE_VIDEO;
  // Pixel format frames are converted to on output.
  AVPixelFormat pixFormat = AVPixelFormat::AV_PIX_FMT_RGB24;
  VideoMeta() = default;
};
class VideoDecoder {
public:
VideoDecoder();
// Decodes the video stored in the file `filename`, sampling frames
// according to `params` starting from frame `start_frm`, and appends
// the decoded frames to `sampledFrames`. (Defined in the .cc file.)
void decodeFile(
const std::string& filename,
const Params& params,
const int start_frm,
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
// Same as decodeFile, but reads the encoded video from an in-memory
// buffer of `size` bytes instead of from disk.
void decodeMemory(
const char* buffer,
const int size,
const Params& params,
const int start_frm,
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
private:
// Presumably renders an ffmpeg error code as a readable message —
// implementation not visible here; confirm in the .cc file.
std::string ffmpegErrorStr(int result);
// Computes output dimensions (outHeight, outWidth) of at least
// (heightMin, widthMin) while keeping the original aspect ratio.
void ResizeAndKeepAspectRatio(
const int origHeight,
const int origWidth,
const int heightMin,
const int widthMin,
int& outHeight,
int& outWidth);
// Shared decode loop used by decodeFile and decodeMemory, reading
// through the given VideoIOContext.
void decodeLoop(
const std::string& videoName,
VideoIOContext& ioctx,
const Params& params,
const int start_frm,
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
};
} // namespace caffe2
#endif // CAFFE2_VIDEO_VIDEO_DECODER_H_