blob: 6b900d7e0789c7d6bc313b12ed01947f7f9dbf4a [file] [log] [blame]
#ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
#define CAFFE2_VIDEO_VIDEO_DECODER_H_
#include <caffe2/core/logging.h>
#include <stdio.h>
#include <memory>
#include <string>
#include <vector>
extern "C" {
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
}
namespace caffe2 {
#define VIO_BUFFER_SZ 32768
#define MAX_DECODING_FRAMES 10000
// enum to specify 3 special fps sampling behaviors:
// 0: disable fps sampling, no frame sampled at all
// -1: unlimited fps sampling, will sample at native video fps
// -2: disable fps sampling, but will get the frame at specific timestamp
enum SpecialFps {
// disable fps sampling entirely; no frames are returned
SAMPLE_NO_FRAME = 0,
// unlimited sampling: return every frame at the video's native fps
SAMPLE_ALL_FRAMES = -1,
// disable fps sampling, but return the frame at a specific timestamp
SAMPLE_TIMESTAMP_ONLY = -2,
};
// three different types of resolution when decoding the video
// 0: resize to width x height and ignore the aspect ratio;
// 1: resize to make size at least (width x height) and keep the aspect ratio;
// 2: using the original resolution of the video; if resolution
// is smaller than crop_height x crop_width, resize to ensure
// new height >= crop_height and new width >= crop_width
// and keep the aspect ratio;
enum VideoResType {
// resize to exactly width x height, ignoring the aspect ratio
USE_WIDTH_HEIGHT = 0,
// resize so the result is at least (width x height), keeping aspect ratio
USE_MINIMAL_WIDTH_HEIGHT = 1,
// keep the original resolution (resized only if smaller than the crop size)
ORIGINAL_RES = 2,
};
// three different types of decoding behavior are supported
// 0: do temporal jittering to sample a random clip from the video
// 1: uniformly sample multiple clips from the video
// 2: sample a clip from a given starting frame
enum DecodeType {
// sample a random clip via temporal jittering
DO_TMP_JITTER = 0,
// uniformly sample multiple clips across the video
DO_UNIFORM_SMP = 1,
// sample a clip beginning at a caller-supplied start frame
USE_START_FRM = 2,
};
// sampling interval for fps starting at specified timestamp
// use enum SpecialFps to set special fps decoding behavior
// note sampled fps will not always accurately follow the target fps,
// because sampled frame has to snap to actual frame timestamp,
// e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25
// video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2,
// because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
struct SampleInterval {
  // Start of this sampling interval, in seconds.
  double timestamp;
  // Target sampling fps for the interval; either > 0 or one of SpecialFps.
  double fps;

  // Default: catch-all interval that samples every frame.
  SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
  SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}

  // Intervals are ordered by starting timestamp only; fps is ignored.
  bool operator<(const SampleInterval& other) const {
    return timestamp < other.timestamp;
  }
};
class Params {
public:
// return all key-frames regardless of specified fps
bool keyFrames_ = false;
// Output image pixel format
AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
// Index of stream to decode.
// -1 will automatically decode the first video stream.
int streamIndex_ = -1;
// How many frames to output at most from the video
// -1 no limit
int maximumOutputFrames_ = -1;
// params for video resolution
int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
// the size of the patch croped from the input video
int crop_height_ = -1;
int crop_width_ = -1;
// minimal resolution for resizing when using USE_MINIMAL_WIDTH_HEIGHT
int height_min_ = -1;
int width_min_ = -1;
// the video resolution after resizing
int scale_w_ = -1;
int scale_h_ = -1;
// params for decoding behavior
int decode_type_ = DecodeType::DO_TMP_JITTER;
int num_of_required_frame_ = -1;
// intervals_ control variable sampling fps between different timestamps
// intervals_ must be ordered strictly ascending by timestamps
// the first interval must have a timestamp of zero
// fps must be either the 3 special fps defined in SpecialFps, or > 0
std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
Params() {}
/**
* FPS of output frames
* setting here will reset intervals_ and force decoding at target FPS
* This can be used if user just want to decode at a steady fps
*/
Params& fps(float v) {
intervals_.clear();
intervals_.emplace_back(0, v);
return *this;
}
/**
* Sample output frames at a specified list of timestamps
* Timestamps must be in increasing order, and timestamps past the end of the
* video will be ignored
* Setting here will reset intervals_
*/
Params& setSampleTimestamps(const std::vector<double>& timestamps) {
intervals_.clear();
// insert an interval per desired frame.
for (auto& timestamp : timestamps) {
intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY);
}
return *this;
}
/**
* Pixel format of output buffer, default PIX_FMT_RGB24
*/
Params& pixelFormat(AVPixelFormat pixelFormat) {
pixelFormat_ = pixelFormat;
return *this;
}
/**
* Return all key-frames
*/
Params& keyFrames(bool keyFrames) {
keyFrames_ = keyFrames;
return *this;
}
/**
* Index of video stream to process, defaults to the first video stream
*/
Params& streamIndex(int index) {
streamIndex_ = index;
return *this;
}
/**
* Only output this many frames, default to no limit
*/
Params& maxOutputFrames(int count) {
maximumOutputFrames_ = count;
return *this;
}
/**
* Output frame width, default to video width
*/
Params& outputWidth(int width) {
scale_w_ = width;
return *this;
}
/**
* Output frame height, default to video height
*/
Params& outputHeight(int height) {
scale_h_ = height;
return *this;
}
};
// data structure for storing decoded video frames
class DecodedFrame {
public:
// Deleter matching FFmpeg's allocator, so AvDataPtr releases frame
// buffers with av_free (they are allocated with av_malloc).
struct avDeleter {
void operator()(unsigned char* p) const {
av_free(p);
}
};
// Owning pointer to an av_malloc'ed pixel buffer.
using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
// decoded data buffer
AvDataPtr data_;
// size in bytes
int size_ = 0;
// frame dimensions
int width_ = 0;
int height_ = 0;
// timestamp in seconds since beginning of video
double timestamp_ = 0;
// true if this is a key frame.
bool keyFrame_ = false;
// index of frame in video
int index_ = -1;
// Sequential number of outputted frame
int outputFrameIndex_ = -1;
};
class VideoIOContext {
 public:
  // I/O context reading encoded video from a file on disk.
  // On fopen failure this only logs (preserving the original contract of
  // not throwing); the callbacks below guard against the null FILE*
  // instead of invoking UB via feof(nullptr)/fread(nullptr).
  explicit VideoIOContext(const std::string& fname)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(nullptr),
        inputBufferSize_(0) {
    inputFile_ = fopen(fname.c_str(), "rb");
    if (inputFile_ == nullptr) {
      LOG(ERROR) << "Error opening video file " << fname;
    }
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0, // write_flag: read-only context
        this,
        &VideoIOContext::readFile,
        nullptr, // no write function
        &VideoIOContext::seekFile);
  }

  // I/O context reading encoded video from an in-memory buffer.
  // `buffer` is NOT owned and must outlive this object.
  explicit VideoIOContext(const char* buffer, int size)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(buffer),
        inputBufferSize_(size) {
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0, // write_flag: read-only context
        this,
        &VideoIOContext::readMemory,
        nullptr, // no write function
        &VideoIOContext::seekMemory);
  }

  // Non-copyable: a copy would double-fclose inputFile_ and double-free
  // ctx_ / the work buffer. (This also suppresses the implicit moves.)
  VideoIOContext(const VideoIOContext&) = delete;
  VideoIOContext& operator=(const VideoIOContext&) = delete;

  ~VideoIOContext() {
    // NOTE(review): libav may internally re-allocate ctx_->buffer, in
    // which case workBuffer_ frees a stale pointer on destruction; newer
    // FFmpeg recommends avio_context_free() plus av_freep(&ctx_->buffer).
    // Kept as in the original — confirm against the FFmpeg version in use.
    av_free(ctx_);
    if (inputFile_) {
      fclose(inputFile_);
    }
  }

  // Reads up to buf_size bytes from whichever source this context wraps.
  // Returns bytes read, AVERROR_EOF at end of input, or -1 on error /
  // no source.
  int read(unsigned char* buf, int buf_size) {
    if (inputBuffer_) {
      return readMemory(this, buf, buf_size);
    } else if (inputFile_) {
      return readFile(this, buf, buf_size);
    } else {
      return -1;
    }
  }

  // Seeks within whichever source this context wraps; whence may also be
  // AVSEEK_SIZE to query the total stream size. Returns -1 on error.
  int64_t seek(int64_t offset, int whence) {
    if (inputBuffer_) {
      return seekMemory(this, offset, whence);
    } else if (inputFile_) {
      return seekFile(this, offset, whence);
    } else {
      return -1;
    }
  }

  // AVIO read callback for file mode.
  static int readFile(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    // Guard against a failed fopen in the constructor.
    if (h->inputFile_ == nullptr) {
      return AVERROR_EOF;
    }
    if (feof(h->inputFile_)) {
      return AVERROR_EOF;
    }
    size_t ret = fread(buf, 1, buf_size, h->inputFile_);
    // fread returns size_t; compare unsigned-to-unsigned (buf_size is
    // never negative here) to avoid a signed/unsigned mismatch.
    if (ret < static_cast<size_t>(buf_size)) {
      if (ferror(h->inputFile_)) {
        return -1;
      }
      // Zero bytes and no error means end of file; report it explicitly,
      // since modern FFmpeg no longer accepts a 0 return from read.
      if (ret == 0) {
        return AVERROR_EOF;
      }
    }
    return static_cast<int>(ret);
  }

  // AVIO seek callback for file mode.
  static int64_t seekFile(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    // Guard against a failed fopen in the constructor.
    if (h->inputFile_ == nullptr) {
      return -1;
    }
    switch (whence) {
      case SEEK_CUR: // from current position
      case SEEK_END: // from eof
      case SEEK_SET: // from beginning of file
        return fseek(h->inputFile_, static_cast<long>(offset), whence);
      case AVSEEK_SIZE: {
        // Report the total stream size: remember the current position,
        // seek to the end to measure, then restore.
        int64_t cur = ftell(h->inputFile_);
        fseek(h->inputFile_, 0L, SEEK_END);
        int64_t size = ftell(h->inputFile_);
        fseek(h->inputFile_, cur, SEEK_SET);
        return size;
      }
    }
    return -1;
  }

  // AVIO read callback for memory mode.
  static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    if (buf_size < 0) {
      return -1;
    }
    // Bytes still available past the current read offset; may be
    // negative if a seek moved past the end of the buffer.
    int remainder = h->inputBufferSize_ - h->offset_;
    if (remainder <= 0) {
      return AVERROR_EOF;
    }
    int r = buf_size < remainder ? buf_size : remainder;
    memcpy(buf, h->inputBuffer_ + h->offset_, r);
    h->offset_ += r;
    return r;
  }

  // AVIO seek callback for memory mode. Seeking past the end is allowed;
  // the next readMemory will then report AVERROR_EOF.
  static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    switch (whence) {
      case SEEK_CUR: // from current position
        h->offset_ += offset;
        break;
      case SEEK_END: // from eof
        h->offset_ = h->inputBufferSize_ + offset;
        break;
      case SEEK_SET: // from beginning of file
        h->offset_ = offset;
        break;
      case AVSEEK_SIZE:
        return h->inputBufferSize_;
    }
    return h->offset_;
  }

  // Non-owning access to the underlying AVIOContext.
  AVIOContext* get_avio() {
    return ctx_;
  }

 private:
  // Size of the scratch buffer handed to avio_alloc_context.
  int workBuffersize_;
  // Scratch buffer (av_malloc'ed; see the destructor NOTE about ownership).
  DecodedFrame::AvDataPtr workBuffer_;
  // for file mode; null when in memory mode or if fopen failed
  FILE* inputFile_;
  // for memory mode; non-owning
  const char* inputBuffer_;
  int inputBufferSize_;
  // current read position within inputBuffer_
  int offset_ = 0;
  AVIOContext* ctx_;
};
struct VideoMeta {
  // Native frame rate of the stream; -1 until probed.
  double fps = -1;
  // Frame dimensions in pixels; -1 until probed.
  int width = -1;
  int height = -1;
  // Media type of the selected stream.
  enum AVMediaType codec_type = AVMEDIA_TYPE_VIDEO;
  // Pixel format frames are converted to on output.
  AVPixelFormat pixFormat = AVPixelFormat::AV_PIX_FMT_RGB24;
  VideoMeta() = default;
};
class VideoDecoder {
public:
VideoDecoder();
// Decodes the video stored in the file `filename`, sampling frames
// according to `params` starting from frame `start_frm`, and appends
// the decoded frames to `sampledFrames`. (Defined in the .cc file.)
void decodeFile(
const std::string& filename,
const Params& params,
const int start_frm,
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
// Same as decodeFile, but reads the encoded video from an in-memory
// buffer of `size` bytes instead of from disk.
void decodeMemory(
const char* buffer,
const int size,
const Params& params,
const int start_frm,
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
private:
// Presumably renders an ffmpeg error code as a readable message —
// implementation not visible here; confirm in the .cc file.
std::string ffmpegErrorStr(int result);
// Computes output dimensions (outHeight, outWidth) of at least
// (heightMin, widthMin) while keeping the original aspect ratio.
void ResizeAndKeepAspectRatio(
const int origHeight,
const int origWidth,
const int heightMin,
const int widthMin,
int& outHeight,
int& outWidth);
// Shared decode loop used by decodeFile and decodeMemory, reading
// through the given VideoIOContext.
void decodeLoop(
const std::string& videoName,
VideoIOContext& ioctx,
const Params& params,
const int start_frm,
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
};
} // namespace caffe2
#endif // CAFFE2_VIDEO_VIDEO_DECODER_H_