| #include <caffe2/video/video_io.h> |
| #include <caffe2/core/logging.h> |
| #include <algorithm> |
| #include <random> |
| #include <string> |
| |
| namespace caffe2 { |
| |
// assume CLHW order and color channels RGB
// Random saturation jitter, applied in place.
//
// A single blend factor alpha = 1 + U(-alpha_rand, alpha_rand) is drawn from
// *randgen (exactly one draw per call). Each pixel is then replaced by
// alpha * pixel + (1 - alpha) * gray, where gray is that pixel's own luma
// (0.299 R + 0.587 G + 0.114 B), computed before any channel is overwritten.
//
// clip must hold 3 * length * crop_height * crop_width floats.
void Saturation(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // Distance, in floats, between the R, G and B planes.
  const int channel_size = length * crop_height * crop_width;
  // Nothing is touched unless every dimension is positive.
  const bool has_pixels = length > 0 && crop_height > 0 && crop_width > 0;
  const int pixel_count = has_pixels ? channel_size : 0;

  for (int p = 0; p < pixel_count; ++p) {
    // RGB to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114
    const float gray_color = clip[p] * 0.299f +
        clip[p + channel_size] * 0.587f + clip[p + 2 * channel_size] * 0.114f;
    for (int c = 0; c < 3; ++c) {
      float& value = clip[c * channel_size + p];
      value = value * alpha + gray_color * (1.0f - alpha);
    }
  }
}
| |
// assume CLHW order and color channels RGB
// Random brightness jitter, applied in place.
//
// Draws one factor alpha = 1 + U(-alpha_rand, alpha_rand) from *randgen and
// multiplies all 3 * length * crop_height * crop_width values of clip by it.
void Brightness(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // The factor is always drawn; only the scaling is skipped for empty clips.
  if (length <= 0 || crop_height <= 0 || crop_width <= 0) {
    return;
  }

  float* const clip_end = clip + 3 * length * crop_height * crop_width;
  for (float* value = clip; value != clip_end; ++value) {
    *value *= alpha;
  }
}
| |
// assume CLHW order and color channels RGB
// Random contrast jitter, applied in place.
//
// First computes the mean luma of the whole clip
// (0.299 R + 0.587 G + 0.114 B averaged over length * crop_height * crop_width
// pixels), then draws alpha = 1 + U(-alpha_rand, alpha_rand) from *randgen and
// replaces every value v in all three channels by
// alpha * v + (1 - alpha) * mean_luma.
void Contrast(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  const int channel_size = length * crop_height * crop_width;
  // No pixel is read or written unless every dimension is positive.
  const bool has_pixels = length > 0 && crop_height > 0 && crop_width > 0;
  const int pixel_count = has_pixels ? channel_size : 0;

  // Pass 1: accumulate the grayscale value of every pixel.
  float gray_mean = 0;
  for (int p = 0; p < pixel_count; ++p) {
    // RGB to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114
    gray_mean += clip[p] * 0.299f + clip[p + channel_size] * 0.587f +
        clip[p + 2 * channel_size] * 0.114f;
  }
  gray_mean /= channel_size;

  // The factor is drawn after the mean so generator use matches the pixel
  // passes: exactly one draw per call.
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // Pass 2: blend every value of every channel towards the mean luma.
  const int value_count = 3 * pixel_count;
  for (int p = 0; p < value_count; ++p) {
    clip[p] = clip[p] * alpha + gray_mean * (1.0f - alpha);
  }
}
| |
| // assume CLHW order and color channels RGB |
| void ColorJitter( |
| float* clip, |
| const int length, |
| const int crop_height, |
| const int crop_width, |
| const float saturation, |
| const float brightness, |
| const float contrast, |
| std::mt19937* randgen) { |
| std::srand(unsigned(std::time(0))); |
| std::vector<int> jitter_order{0, 1, 2}; |
| // obtain a time-based seed: |
| unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); |
| std::shuffle( |
| jitter_order.begin(), |
| jitter_order.end(), |
| std::default_random_engine(seed)); |
| |
| for (int i = 0; i < 3; ++i) { |
| if (jitter_order[i] == 0) { |
| Saturation(clip, length, crop_height, crop_width, saturation, randgen); |
| } else if (jitter_order[i] == 1) { |
| Brightness(clip, length, crop_height, crop_width, brightness, randgen); |
| } else { |
| Contrast(clip, length, crop_height, crop_width, contrast, randgen); |
| } |
| } |
| } |
| |
// assume CLHW order and color channels RGB
// Random per-channel lighting offset, applied in place.
//
// Draws three coefficients from N(0, alpha_std) using *randgen, combines them
// with the 3x3 eigvecs and the 3 eigvals as
//   delta[i] = sum_j eigvecs[i][j] * eigvals[j] * alpha[j]
// and adds delta[c] to every value of channel c.
//
// eigvecs must provide at least 3 rows of 3 entries and eigvals at least 3
// entries; neither is checked here.
void ColorLighting(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_std,
    const std::vector<std::vector<float>>& eigvecs,
    const std::vector<float>& eigvals,
    std::mt19937* randgen) {
  // One distribution object serves all three draws, in channel order.
  std::normal_distribution<float> noise(0, alpha_std);
  float alphas[3];
  for (float& alpha : alphas) {
    alpha = noise(*randgen);
  }

  float delta_rgb[3] = {0.0f, 0.0f, 0.0f};
  for (int row = 0; row < 3; ++row) {
    for (int col = 0; col < 3; ++col) {
      delta_rgb[row] += eigvecs[row][col] * eigvals[col] * alphas[col];
    }
  }

  // Nothing is written unless every dimension is positive.
  const bool has_pixels = length > 0 && crop_height > 0 && crop_width > 0;
  const int plane_size = has_pixels ? length * crop_height * crop_width : 0;

  float* value = clip;
  for (const float shift : delta_rgb) {
    for (int k = 0; k < plane_size; ++k) {
      *value += shift;
      ++value;
    }
  }
}
| |
// assume CLHW order and color channels RGB
// mean subtraction and scaling.
//
// Rewrites every value v of channel c, in place, as
// (v - mean[c]) * inv_std[c]. clip holds
// channels * length * crop_height * crop_width floats; mean and inv_std must
// each provide one entry per channel (not checked here).
void ColorNormalization(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const int channels,
    const std::vector<float>& mean,
    const std::vector<float>& inv_std) {
  // Nothing is read or written unless every spatial/temporal extent is
  // positive.
  if (length <= 0 || crop_height <= 0 || crop_width <= 0) {
    return;
  }
  const int plane_size = length * crop_height * crop_width;

  float* value = clip;
  for (int c = 0; c < channels; ++c) {
    for (int k = 0; k < plane_size; ++k, ++value) {
      *value = (*value - mean[c]) * inv_std[c];
    }
  }
}
| |
| void ClipTransformRGB( |
| const unsigned char* buffer_rgb, |
| const int multi_crop_count, |
| const int crop_height, |
| const int crop_width, |
| const int length_rgb, |
| const int channels_rgb, |
| const int sampling_rate_rgb, |
| const int height, |
| const int width, |
| const int h_off, |
| const int w_off, |
| const int* multi_crop_h_off, |
| const int* multi_crop_w_off, |
| const bool mirror_me, |
| const bool color_jitter, |
| const float saturation, |
| const float brightness, |
| const float contrast, |
| const bool color_lighting, |
| const float color_lighting_std, |
| const std::vector<std::vector<float>>& color_lighting_eigvecs, |
| const std::vector<float>& color_lighting_eigvals, |
| const std::vector<float>& mean_rgb, |
| const std::vector<float>& inv_std_rgb, |
| std::mt19937* randgen, |
| float* transformed_clip) { |
| CAFFE_ENFORCE_EQ( |
| channels_rgb, mean_rgb.size(), "rgb channels must be equal to mean size"); |
| CAFFE_ENFORCE_EQ( |
| mean_rgb.size(), |
| inv_std_rgb.size(), |
| "mean size must be equal to inv_std size"); |
| int orig_index, tran_index; |
| if (multi_crop_count == 1) { |
| // Case 1: Multi_cropping is disabled |
| // The order of output dimensions is C, L, H, W |
| bool do_color_jitter_lighting = |
| (color_jitter || color_lighting) && channels_rgb == 3; |
| for (int c = 0; c < channels_rgb; ++c) { |
| for (int l = 0; l < length_rgb; ++l) { |
| int orig_index_l = |
| l * sampling_rate_rgb * height * width * channels_rgb; |
| int tran_index_l = (c * length_rgb + l) * crop_height; |
| |
| for (int h = 0; h < crop_height; ++h) { |
| int orig_index_h = orig_index_l + (h + h_off) * width * channels_rgb; |
| int tran_index_h = (tran_index_l + h) * crop_width; |
| |
| for (int w = 0; w < crop_width; ++w) { |
| orig_index = orig_index_h + (w + w_off) * channels_rgb + c; |
| |
| // mirror the frame |
| if (mirror_me) { |
| tran_index = tran_index_h + (crop_width - 1 - w); |
| } else { |
| tran_index = tran_index_h + w; |
| } |
| |
| // normalize and transform the clip |
| if (do_color_jitter_lighting) { |
| transformed_clip[tran_index] = buffer_rgb[orig_index]; |
| } else { |
| transformed_clip[tran_index] = |
| (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c]; |
| } |
| } |
| } |
| } |
| } |
| if (color_jitter && channels_rgb == 3) { |
| ColorJitter( |
| transformed_clip, |
| length_rgb, |
| crop_height, |
| crop_width, |
| saturation, |
| brightness, |
| contrast, |
| randgen); |
| } |
| if (color_lighting && channels_rgb == 3) { |
| ColorLighting( |
| transformed_clip, |
| length_rgb, |
| crop_height, |
| crop_width, |
| color_lighting_std, |
| color_lighting_eigvecs, |
| color_lighting_eigvals, |
| randgen); |
| } |
| if (do_color_jitter_lighting) { |
| // Color normalization |
| // Mean subtraction and division by standard deviation. |
| ColorNormalization( |
| transformed_clip, |
| length_rgb, |
| crop_height, |
| crop_width, |
| channels_rgb, |
| mean_rgb, |
| inv_std_rgb); |
| } |
| } else { |
| // Case 2: Multi_cropping is enabled. Multi cropping should be only used at |
| // testing stage. So color jittering and lighting are not used |
| for (int multi_crop_mirror = 0; multi_crop_mirror < 2; |
| ++multi_crop_mirror) { |
| for (int i = 0; i < multi_crop_count / 2; ++i) { |
| for (int c = 0; c < channels_rgb; ++c) { |
| for (int l = 0; l < length_rgb; ++l) { |
| int orig_index_l = |
| l * sampling_rate_rgb * height * width * channels_rgb; |
| int tran_index_l = (c * length_rgb + l) * crop_height; |
| |
| for (int h = 0; h < crop_height; ++h) { |
| int orig_index_h = orig_index_l + |
| (h + multi_crop_h_off[i]) * width * channels_rgb; |
| int tran_index_h = (tran_index_l + h) * crop_width; |
| |
| for (int w = 0; w < crop_width; ++w) { |
| orig_index = |
| orig_index_h + (w + multi_crop_w_off[i]) * channels_rgb + c; |
| |
| if (multi_crop_mirror == 1) { |
| tran_index = tran_index_h + (crop_width - 1 - w); |
| } else { |
| tran_index = tran_index_h + w; |
| } |
| |
| transformed_clip[tran_index] = |
| (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c]; |
| } |
| } |
| } |
| } |
| transformed_clip += |
| channels_rgb * length_rgb * crop_height * crop_width; |
| } |
| } |
| } |
| } |
| |
| void ClipTransformOpticalFlow( |
| const unsigned char* buffer_rgb, |
| const int crop_height, |
| const int crop_width, |
| const int length_of, |
| const int channels_of, |
| const int sampling_rate_of, |
| const int height, |
| const int width, |
| const cv::Rect& rect, |
| const int channels_rgb, |
| const bool mirror_me, |
| const int flow_alg_type, |
| const int flow_data_type, |
| const int frame_gap_of, |
| const bool do_flow_aggregation, |
| const std::vector<float>& mean_of, |
| const std::vector<float>& inv_std_of, |
| float* transformed_clip) { |
| const int frame_size = crop_height * crop_width; |
| const int channel_size_flow = length_of * frame_size; |
| |
| // for get the mean and std of the input data |
| bool extract_statistics = false; |
| static std::vector<double> mean_static(channels_of, 0.f); |
| static std::vector<double> std_static(channels_of, 0.f); |
| static long long count = 0; |
| cv::Scalar mean_img, std_img; |
| |
| for (int l = 0; l < length_of; l++) { |
| // get the grayscale frames |
| std::vector<cv::Mat> grays, rgbs; |
| int step_size = do_flow_aggregation ? 1 : frame_gap_of; |
| for (int j = 0; j <= frame_gap_of; j += step_size) { |
| // get the current frame |
| const unsigned char* curr_frame = buffer_rgb + |
| (l * sampling_rate_of + j) * height * width * channels_rgb; |
| cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3); |
| memcpy( |
| img.data, |
| curr_frame, |
| height * width * channels_rgb * sizeof(unsigned char)); |
| |
| // crop and mirror the frame |
| cv::Mat img_cropped = img(rect); |
| if (mirror_me) { |
| cv::flip(img_cropped, img_cropped, 1); |
| } |
| |
| cv::Mat gray; |
| cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY); |
| grays.push_back(gray); |
| rgbs.push_back(img_cropped); |
| } |
| |
| cv::Mat first_gray, first_rgb; |
| cv::Mat flow = cv::Mat::zeros(crop_height, crop_width, CV_32FC2); |
| MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow); |
| |
| std::vector<cv::Mat> imgs; |
| cv::split(flow, imgs); |
| // save the 2-channel optical flow first |
| int c = 0; |
| for (; c < 2; c++) { |
| if (extract_statistics) { |
| cv::meanStdDev(imgs[c], mean_img, std_img); |
| mean_static[c] += mean_img[0]; |
| std_static[c] += std_img[0]; |
| } |
| |
| imgs[c] -= mean_of[c]; |
| imgs[c] *= inv_std_of[c]; |
| memcpy( |
| transformed_clip + c * channel_size_flow + l * frame_size, |
| imgs[c].data, |
| frame_size * sizeof(float)); |
| } |
| |
| cv::Mat mag; |
| std::vector<cv::Mat> chans; |
| // augment the optical flow with more channels |
| switch (flow_data_type) { |
| case FlowDataType::Flow2C: |
| // nothing to do if we only need two channels |
| break; |
| |
| case FlowDataType::Flow3C: |
| // use magnitude as the third channel |
| mag = cv::abs(imgs[0]) + cv::abs(imgs[1]); |
| if (extract_statistics) { |
| cv::meanStdDev(mag, mean_img, std_img); |
| mean_static[c] += mean_img[0]; |
| std_static[c] += std_img[0]; |
| } |
| |
| mag -= mean_of[c]; |
| mag *= inv_std_of[c]; |
| memcpy( |
| transformed_clip + c * channel_size_flow + l * frame_size, |
| mag.data, |
| frame_size * sizeof(float)); |
| break; |
| |
| case FlowDataType::FlowWithGray: |
| // add grayscale image as the third channel |
| grays[0].convertTo(first_gray, CV_32FC1); |
| if (extract_statistics) { |
| cv::meanStdDev(first_gray, mean_img, std_img); |
| mean_static[c] += mean_img[0]; |
| std_static[c] += std_img[0]; |
| } |
| |
| first_gray -= mean_of[c]; |
| first_gray *= inv_std_of[c]; |
| memcpy( |
| transformed_clip + c * channel_size_flow + l * frame_size, |
| first_gray.data, |
| frame_size * sizeof(float)); |
| break; |
| |
| case FlowDataType::FlowWithRGB: |
| // add all three rgb channels |
| rgbs[0].convertTo(first_rgb, CV_32FC3); |
| cv::split(first_rgb, chans); |
| for (; c < channels_of; c++) { |
| if (extract_statistics) { |
| cv::meanStdDev(chans[c - 2], mean_img, std_img); |
| mean_static[c] += mean_img[0]; |
| std_static[c] += std_img[0]; |
| } |
| |
| chans[c - 2] -= mean_of[c]; |
| chans[c - 2] *= inv_std_of[c]; |
| memcpy( |
| transformed_clip + c * channel_size_flow + l * frame_size, |
| chans[c - 2].data, |
| frame_size * sizeof(float)); |
| } |
| break; |
| |
| default: |
| LOG(ERROR) << "Unsupported optical flow data type " << flow_data_type; |
| break; |
| } |
| |
| if (extract_statistics) { |
| count++; |
| if (count % 1000 == 1) { |
| for (int i = 0; i < channels_of; i++) { |
| LOG(INFO) << i |
| << "-th channel mean: " << mean_static[i] / float(count) |
| << " std: " << std_static[i] / float(count); |
| } |
| } |
| } |
| } |
| } |
| |
| void FreeDecodedData( |
| std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) { |
| // free the sampledFrames |
| for (int i = 0; i < sampledFrames.size(); i++) { |
| DecodedFrame* p = sampledFrames[i].release(); |
| delete p; |
| } |
| sampledFrames.clear(); |
| } |
| |
| bool DecodeMultipleClipsFromVideo( |
| const char* video_buffer, |
| const std::string& video_filename, |
| const int encoded_size, |
| const Params& params, |
| const int start_frm, |
| const int clip_per_video, |
| const bool use_local_file, |
| int& height, |
| int& width, |
| std::vector<unsigned char*>& buffer_rgb) { |
| std::vector<std::unique_ptr<DecodedFrame>> sampledFrames; |
| VideoDecoder decoder; |
| |
| // decoding from buffer or file |
| if (!use_local_file) { |
| decoder.decodeMemory( |
| video_buffer, encoded_size, params, start_frm, sampledFrames); |
| } else { |
| decoder.decodeFile(video_filename, params, start_frm, sampledFrames); |
| } |
| |
| for (int i = 0; i < buffer_rgb.size(); i++) { |
| unsigned char* buff = buffer_rgb[i]; |
| delete[] buff; |
| } |
| buffer_rgb.clear(); |
| |
| if (sampledFrames.size() < params.num_of_required_frame_) { |
| // LOG(ERROR) << "The video seems faulty and we could not decode enough |
| // frames: " |
| // << sampledFrames.size() << " VS " << |
| // params.num_of_required_frame_; |
| FreeDecodedData(sampledFrames); |
| return true; |
| } |
| |
| height = sampledFrames[0]->height_; |
| width = sampledFrames[0]->width_; |
| float sample_stepsz = (clip_per_video <= 1) |
| ? 0 |
| : (float(sampledFrames.size() - params.num_of_required_frame_) / |
| (clip_per_video - 1)); |
| |
| int image_size = 3 * height * width; |
| int clip_size = params.num_of_required_frame_ * image_size; |
| // get the RGB frames for each clip |
| for (int i = 0; i < clip_per_video; i++) { |
| unsigned char* buffer_rgb_ptr = new unsigned char[clip_size]; |
| int clip_start = floor(i * sample_stepsz); |
| for (int j = 0; j < params.num_of_required_frame_; j++) { |
| memcpy( |
| buffer_rgb_ptr + j * image_size, |
| (unsigned char*)sampledFrames[j + clip_start]->data_.get(), |
| image_size * sizeof(unsigned char)); |
| } |
| buffer_rgb.push_back(buffer_rgb_ptr); |
| } |
| FreeDecodedData(sampledFrames); |
| |
| return true; |
| } |
| |
| } // namespace caffe2 |