// blob: 0824cded6868c2a66feb1c02c47a5674674f26c8 [file] [log] [blame]
#include <caffe2/video/video_io.h>
#include <caffe2/core/logging.h>
#include <algorithm>
#include <random>
#include <string>
namespace caffe2 {
// Randomly jitters the saturation of an RGB clip in place.
//
// The clip is assumed to be stored in C, L, H, W order with the color planes
// in R, G, B order. A blend factor alpha = 1 + u is drawn once, with u taken
// from a uniform distribution over [-alpha_rand, alpha_rand), and every pixel
// becomes value * alpha + gray * (1 - alpha), where gray is the luma of that
// pixel (R -> 0.299, G -> 0.587, B -> 0.114).
void Saturation(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // An empty clip has no pixels to blend; alpha has still been drawn, so the
  // generator advances regardless.
  if (length <= 0 || crop_height <= 0 || crop_width <= 0) {
    return;
  }

  const int plane = length * crop_height * crop_width;
  float* const red = clip;
  float* const green = clip + plane;
  float* const blue = clip + 2 * plane;
  for (int idx = 0; idx < plane; ++idx) {
    // The gray value is computed from the untouched R, G, B of this pixel
    // before any of the three is overwritten.
    const float gray =
        red[idx] * 0.299f + green[idx] * 0.587f + blue[idx] * 0.114f;
    red[idx] = red[idx] * alpha + gray * (1.0f - alpha);
    green[idx] = green[idx] * alpha + gray * (1.0f - alpha);
    blue[idx] = blue[idx] * alpha + gray * (1.0f - alpha);
  }
}
// Randomly jitters the brightness of an RGB clip in place.
//
// The clip is assumed to be stored in C, L, H, W order with three color
// planes. A single factor alpha = 1 + u is drawn, with u taken from a uniform
// distribution over [-alpha_rand, alpha_rand), and every value of all three
// planes is multiplied by it.
void Brightness(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // Nothing to scale for an empty clip (the random draw above still happens).
  if (length <= 0 || crop_height <= 0 || crop_width <= 0) {
    return;
  }

  // The three planes are contiguous, so one flat pass covers them all.
  const int total = 3 * length * crop_height * crop_width;
  for (int idx = 0; idx < total; ++idx) {
    clip[idx] *= alpha;
  }
}
// Randomly jitters the contrast of an RGB clip in place.
//
// The clip is assumed to be stored in C, L, H, W order with the color planes
// in R, G, B order. The mean gray value of the whole clip is computed first
// (R -> 0.299, G -> 0.587, B -> 0.114), then a factor alpha = 1 + u is drawn,
// with u taken from a uniform distribution over [-alpha_rand, alpha_rand), and
// every value becomes value * alpha + gray_mean * (1 - alpha).
void Contrast(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  const bool has_pixels = length > 0 && crop_height > 0 && crop_width > 0;
  const int plane = length * crop_height * crop_width;

  // Accumulate the gray value of every pixel position, in storage order.
  float gray_mean = 0;
  if (has_pixels) {
    for (int idx = 0; idx < plane; ++idx) {
      gray_mean += clip[idx] * 0.299f + clip[idx + plane] * 0.587f +
          clip[idx + 2 * plane] * 0.114f;
    }
  }
  gray_mean /= plane;

  // The random factor is drawn after the mean has been computed.
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);
  if (!has_pixels) {
    return;
  }

  // Blend all three contiguous planes towards the clip-wide gray mean.
  const int total = 3 * plane;
  for (int idx = 0; idx < total; ++idx) {
    clip[idx] = clip[idx] * alpha + gray_mean * (1.0f - alpha);
  }
}
// Applies saturation, brightness and contrast jitter to an RGB clip in place,
// one after another in a randomly permuted order.
//
// The clip is assumed to be stored in C, L, H, W order with the color planes
// in R, G, B order.
//
// clip:        clip data, modified in place.
// length:      number of frames in the clip.
// crop_height: frame height.
// crop_width:  frame width.
// saturation:  jitter range forwarded to Saturation().
// brightness:  jitter range forwarded to Brightness().
// contrast:    jitter range forwarded to Contrast().
// randgen:     generator used both for the order of the three operations and,
//              through the callees, for their individual jitter factors.
void ColorJitter(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float saturation,
    const float brightness,
    const float contrast,
    std::mt19937* randgen) {
  // The order is shuffled with the caller's generator rather than a
  // wall-clock seeded engine: the result is then reproducible for a given
  // generator state, and the process-wide C random state (std::srand) is left
  // untouched.
  int jitter_order[3] = {0, 1, 2};
  std::shuffle(jitter_order, jitter_order + 3, *randgen);

  for (const int op : jitter_order) {
    if (op == 0) {
      Saturation(clip, length, crop_height, crop_width, saturation, randgen);
    } else if (op == 1) {
      Brightness(clip, length, crop_height, crop_width, brightness, randgen);
    } else {
      Contrast(clip, length, crop_height, crop_width, contrast, randgen);
    }
  }
}
// Adds a random per-channel offset to an RGB clip in place.
//
// The clip is assumed to be stored in C, L, H, W order with three color
// planes. Three coefficients are drawn from a normal distribution with mean 0
// and standard deviation alpha_std; the offset for channel i is
//   sum_j eigvecs[i][j] * eigvals[j] * alpha[j],  j = 0..2,
// and it is added to every value of that channel. eigvecs is read as a 3x3
// table and eigvals as three values.
void ColorLighting(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_std,
    const std::vector<std::vector<float>>& eigvecs,
    const std::vector<float>& eigvals,
    std::mt19937* randgen) {
  // One distribution object serves all three draws.
  std::normal_distribution<float> noise(0, alpha_std);
  float alphas[3];
  for (float& alpha : alphas) {
    alpha = noise(*randgen);
  }

  float delta_rgb[3] = {0.0f, 0.0f, 0.0f};
  for (int row = 0; row < 3; ++row) {
    for (int col = 0; col < 3; ++col) {
      delta_rgb[row] += eigvecs[row][col] * eigvals[col] * alphas[col];
    }
  }

  // Nothing to shift for an empty clip.
  if (length <= 0 || crop_height <= 0 || crop_width <= 0) {
    return;
  }

  const int plane = length * crop_height * crop_width;
  float* px = clip;
  for (int channel = 0; channel < 3; ++channel) {
    const float shift = delta_rgb[channel];
    for (int idx = 0; idx < plane; ++idx) {
      *px++ += shift;
    }
  }
}
// Normalizes a clip in place: for every channel c, each value becomes
// (value - mean[c]) * inv_std[c].
//
// The clip is assumed to be stored in C, L, H, W order, i.e. each channel is
// one contiguous plane of length * crop_height * crop_width values.
void ColorNormalization(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const int channels,
    const std::vector<float>& mean,
    const std::vector<float>& inv_std) {
  // Without pixels there is nothing to normalize, and mean / inv_std are not
  // consulted at all.
  if (length <= 0 || crop_height <= 0 || crop_width <= 0) {
    return;
  }

  const int plane = length * crop_height * crop_width;
  float* px = clip;
  for (int channel = 0; channel < channels; ++channel) {
    const float offset = mean[channel];
    const float scale = inv_std[channel];
    for (int idx = 0; idx < plane; ++idx, ++px) {
      *px = (*px - offset) * scale;
    }
  }
}
// Crops, optionally mirrors, optionally color-augments and normalizes the RGB
// frames in buffer_rgb, writing float output to transformed_clip.
//
// Input frames are read as interleaved pixels (index
// ((frame * height + y) * width + x) * channels_rgb + c); every
// sampling_rate_rgb-th frame is used. Output is written in C, L, H, W order.
//
// multi_crop_count == 1:
//   A single crop at (h_off, w_off) is produced, mirrored when mirror_me is
//   set. If color_jitter or color_lighting is requested and channels_rgb is 3,
//   raw pixel values are copied first, the requested augmentations are applied
//   and the clip is normalized afterwards; otherwise each value is normalized
//   as (pixel - mean_rgb[c]) * inv_std_rgb[c] while it is copied.
// otherwise:
//   multi_crop_count / 2 crops at (multi_crop_h_off[i], multi_crop_w_off[i])
//   are written back to back, first unmirrored and then the same crops
//   mirrored. Color jitter and lighting are not applied in this mode.
//
// Enforces that channels_rgb, mean_rgb.size() and inv_std_rgb.size() agree.
void ClipTransformRGB(
    const unsigned char* buffer_rgb,
    const int multi_crop_count,
    const int crop_height,
    const int crop_width,
    const int length_rgb,
    const int channels_rgb,
    const int sampling_rate_rgb,
    const int height,
    const int width,
    const int h_off,
    const int w_off,
    const int* multi_crop_h_off,
    const int* multi_crop_w_off,
    const bool mirror_me,
    const bool color_jitter,
    const float saturation,
    const float brightness,
    const float contrast,
    const bool color_lighting,
    const float color_lighting_std,
    const std::vector<std::vector<float>>& color_lighting_eigvecs,
    const std::vector<float>& color_lighting_eigvals,
    const std::vector<float>& mean_rgb,
    const std::vector<float>& inv_std_rgb,
    std::mt19937* randgen,
    float* transformed_clip) {
  CAFFE_ENFORCE_EQ(
      channels_rgb, mean_rgb.size(), "rgb channels must be equal to mean size");
  CAFFE_ENFORCE_EQ(
      mean_rgb.size(),
      inv_std_rgb.size(),
      "mean size must be equal to inv_std size");

  // Copies one crop whose top-left corner is (top, left) into dst in
  // C, L, H, W order. `flip` reverses each output row; `normalize` applies
  // mean subtraction and scaling while copying.
  const auto copy_crop = [&](float* dst,
                             const int top,
                             const int left,
                             const bool flip,
                             const bool normalize) {
    for (int c = 0; c < channels_rgb; ++c) {
      for (int l = 0; l < length_rgb; ++l) {
        const int src_frame =
            l * sampling_rate_rgb * height * width * channels_rgb;
        const int dst_frame = (c * length_rgb + l) * crop_height;
        for (int h = 0; h < crop_height; ++h) {
          const int src_row = src_frame + (h + top) * width * channels_rgb;
          const int dst_row = (dst_frame + h) * crop_width;
          for (int w = 0; w < crop_width; ++w) {
            const int src = src_row + (w + left) * channels_rgb + c;
            const int dst_col = flip ? (crop_width - 1 - w) : w;
            if (normalize) {
              dst[dst_row + dst_col] =
                  (buffer_rgb[src] - mean_rgb[c]) * inv_std_rgb[c];
            } else {
              dst[dst_row + dst_col] = buffer_rgb[src];
            }
          }
        }
      }
    }
  };

  if (multi_crop_count == 1) {
    // Case 1: multi-cropping is disabled.
    const bool do_color_jitter_lighting =
        (color_jitter || color_lighting) && channels_rgb == 3;

    // When augmentation follows, keep raw pixel values for now and normalize
    // once the augmentations are done.
    copy_crop(
        transformed_clip, h_off, w_off, mirror_me, !do_color_jitter_lighting);

    if (color_jitter && channels_rgb == 3) {
      ColorJitter(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          saturation,
          brightness,
          contrast,
          randgen);
    }
    if (color_lighting && channels_rgb == 3) {
      ColorLighting(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          color_lighting_std,
          color_lighting_eigvecs,
          color_lighting_eigvals,
          randgen);
    }
    if (do_color_jitter_lighting) {
      // Mean subtraction and division by standard deviation.
      ColorNormalization(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          channels_rgb,
          mean_rgb,
          inv_std_rgb);
    }
    return;
  }

  // Case 2: multi-cropping is enabled. Multi cropping should be only used at
  // testing stage, so color jittering and lighting are not used.
  const int crop_stride = channels_rgb * length_rgb * crop_height * crop_width;
  float* out = transformed_clip;
  for (int pass = 0; pass < 2; ++pass) {
    const bool flip = (pass == 1);
    for (int i = 0; i < multi_crop_count / 2; ++i) {
      copy_crop(out, multi_crop_h_off[i], multi_crop_w_off[i], flip, true);
      out += crop_stride;
    }
  }
}
// Computes optical flow for length_of output frames from the RGB frames in
// buffer_rgb, normalizes it and writes it to transformed_clip.
//
// For output frame l, the input frames starting at l * sampling_rate_of are
// gathered: all frames 0..frame_gap_of when do_flow_aggregation is set,
// otherwise only frame 0 and frame frame_gap_of. Each is cropped with rect,
// mirrored when mirror_me is set, and converted to grayscale. The grayscale
// frames are handed to MultiFrameOpticalFlowExtractor, and the two channels of
// the resulting flow are normalized as (value - mean_of[c]) * inv_std_of[c].
//
// Output layout: channel c of frame l starts at
//   c * length_of * crop_height * crop_width + l * crop_height * crop_width.
//
// flow_data_type selects what follows the two flow channels:
//   Flow2C       - nothing;
//   Flow3C       - |flow_0| + |flow_1| of the already normalized flow;
//   FlowWithGray - the first grayscale frame;
//   FlowWithRGB  - the color channels of the first cropped frame, up to
//                  channels_of channels in total.
// Any other value logs an error and writes only the two flow channels.
//
// If the frame step is not positive (frame_gap_of <= 0 without aggregation) an
// error is logged and nothing is written: a step of 0 would otherwise never
// leave the frame-gathering loop, and a negative one would gather no frames.
//
// NOTE(review): every frame is copied into a CV_8UC3 matrix using channels_rgb
// for the byte count, so this presumably requires channels_rgb == 3 - confirm
// with callers.
// NOTE(review): the flow and extra channels are copied out with memcpy from
// Mat::data, which assumes those matrices are continuous - confirm.
void ClipTransformOpticalFlow(
    const unsigned char* buffer_rgb,
    const int crop_height,
    const int crop_width,
    const int length_of,
    const int channels_of,
    const int sampling_rate_of,
    const int height,
    const int width,
    const cv::Rect& rect,
    const int channels_rgb,
    const bool mirror_me,
    const int flow_alg_type,
    const int flow_data_type,
    const int frame_gap_of,
    const bool do_flow_aggregation,
    const std::vector<float>& mean_of,
    const std::vector<float>& inv_std_of,
    float* transformed_clip) {
  const int frame_size = crop_height * crop_width;
  const int channel_size_flow = length_of * frame_size;

  // Distance between the input frames gathered for one flow estimate.
  const int step_size = do_flow_aggregation ? 1 : frame_gap_of;
  if (step_size <= 0) {
    LOG(ERROR) << "Invalid optical flow frame gap " << frame_gap_of;
    return;
  }

  // Optional collection of the mean and std of the input data. Disabled by
  // default; the accumulators are sized by the channels_of of the first call.
  bool extract_statistics = false;
  static std::vector<double> mean_static(channels_of, 0.f);
  static std::vector<double> std_static(channels_of, 0.f);
  static long long count = 0;
  cv::Scalar mean_img, std_img;

  for (int l = 0; l < length_of; l++) {
    // get the grayscale frames
    std::vector<cv::Mat> grays, rgbs;
    for (int j = 0; j <= frame_gap_of; j += step_size) {
      // get the current frame
      const unsigned char* curr_frame = buffer_rgb +
          (l * sampling_rate_of + j) * height * width * channels_rgb;
      cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3);
      memcpy(
          img.data,
          curr_frame,
          height * width * channels_rgb * sizeof(unsigned char));

      // crop and mirror the frame
      cv::Mat img_cropped = img(rect);
      if (mirror_me) {
        cv::flip(img_cropped, img_cropped, 1);
      }

      cv::Mat gray;
      cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY);
      grays.push_back(gray);
      rgbs.push_back(img_cropped);
    }

    cv::Mat first_gray, first_rgb;
    cv::Mat flow = cv::Mat::zeros(crop_height, crop_width, CV_32FC2);
    MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow);

    std::vector<cv::Mat> imgs;
    cv::split(flow, imgs);

    // save the 2-channel optical flow first
    int c = 0;
    for (; c < 2; c++) {
      if (extract_statistics) {
        cv::meanStdDev(imgs[c], mean_img, std_img);
        mean_static[c] += mean_img[0];
        std_static[c] += std_img[0];
      }

      imgs[c] -= mean_of[c];
      imgs[c] *= inv_std_of[c];
      memcpy(
          transformed_clip + c * channel_size_flow + l * frame_size,
          imgs[c].data,
          frame_size * sizeof(float));
    }

    // From here on c == 2, the index of the first extra channel.
    cv::Mat mag;
    std::vector<cv::Mat> chans;

    // augment the optical flow with more channels
    switch (flow_data_type) {
      case FlowDataType::Flow2C:
        // nothing to do if we only need two channels
        break;

      case FlowDataType::Flow3C:
        // use magnitude as the third channel
        mag = cv::abs(imgs[0]) + cv::abs(imgs[1]);
        if (extract_statistics) {
          cv::meanStdDev(mag, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        mag -= mean_of[c];
        mag *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            mag.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithGray:
        // add grayscale image as the third channel
        grays[0].convertTo(first_gray, CV_32FC1);
        if (extract_statistics) {
          cv::meanStdDev(first_gray, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        first_gray -= mean_of[c];
        first_gray *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            first_gray.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithRGB:
        // add all three rgb channels
        rgbs[0].convertTo(first_rgb, CV_32FC3);
        cv::split(first_rgb, chans);
        for (; c < channels_of; c++) {
          if (extract_statistics) {
            cv::meanStdDev(chans[c - 2], mean_img, std_img);
            mean_static[c] += mean_img[0];
            std_static[c] += std_img[0];
          }

          chans[c - 2] -= mean_of[c];
          chans[c - 2] *= inv_std_of[c];
          memcpy(
              transformed_clip + c * channel_size_flow + l * frame_size,
              chans[c - 2].data,
              frame_size * sizeof(float));
        }
        break;

      default:
        LOG(ERROR) << "Unsupported optical flow data type " << flow_data_type;
        break;
    }

    if (extract_statistics) {
      count++;
      if (count % 1000 == 1) {
        for (int i = 0; i < channels_of; i++) {
          LOG(INFO) << i
                    << "-th channel mean: " << mean_static[i] / float(count)
                    << " std: " << std_static[i] / float(count);
        }
      }
    }
  }
}
void FreeDecodedData(
std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
// free the sampledFrames
for (int i = 0; i < sampledFrames.size(); i++) {
DecodedFrame* p = sampledFrames[i].release();
delete p;
}
sampledFrames.clear();
}
bool DecodeMultipleClipsFromVideo(
const char* video_buffer,
const std::string& video_filename,
const int encoded_size,
const Params& params,
const int start_frm,
const int clip_per_video,
const bool use_local_file,
int& height,
int& width,
std::vector<unsigned char*>& buffer_rgb) {
std::vector<std::unique_ptr<DecodedFrame>> sampledFrames;
VideoDecoder decoder;
// decoding from buffer or file
if (!use_local_file) {
decoder.decodeMemory(
video_buffer, encoded_size, params, start_frm, sampledFrames);
} else {
decoder.decodeFile(video_filename, params, start_frm, sampledFrames);
}
for (int i = 0; i < buffer_rgb.size(); i++) {
unsigned char* buff = buffer_rgb[i];
delete[] buff;
}
buffer_rgb.clear();
if (sampledFrames.size() < params.num_of_required_frame_) {
// LOG(ERROR) << "The video seems faulty and we could not decode enough
// frames: "
// << sampledFrames.size() << " VS " <<
// params.num_of_required_frame_;
FreeDecodedData(sampledFrames);
return true;
}
height = sampledFrames[0]->height_;
width = sampledFrames[0]->width_;
float sample_stepsz = (clip_per_video <= 1)
? 0
: (float(sampledFrames.size() - params.num_of_required_frame_) /
(clip_per_video - 1));
int image_size = 3 * height * width;
int clip_size = params.num_of_required_frame_ * image_size;
// get the RGB frames for each clip
for (int i = 0; i < clip_per_video; i++) {
unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
int clip_start = floor(i * sample_stepsz);
for (int j = 0; j < params.num_of_required_frame_; j++) {
memcpy(
buffer_rgb_ptr + j * image_size,
(unsigned char*)sampledFrames[j + clip_start]->data_.get(),
image_size * sizeof(unsigned char));
}
buffer_rgb.push_back(buffer_rgb_ptr);
}
FreeDecodedData(sampledFrames);
return true;
}
} // namespace caffe2