|  |  | 
|  | #ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_ | 
|  | #define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_ | 
|  |  | 
|  | #include <opencv2/opencv.hpp> | 
|  |  | 
|  | #include <algorithm> | 
|  | #include <iostream> | 
|  |  | 
|  | #include "c10/core/thread_pool.h" | 
|  | #include "caffe2/core/common.h" | 
|  | #include "caffe2/core/db.h" | 
|  | #include "caffe2/image/transform_gpu.h" | 
|  | #include "caffe2/operators/prefetch_op.h" | 
|  | #include "caffe2/proto/caffe2_legacy.pb.h" | 
|  | #include "caffe2/utils/cast.h" | 
|  | #include "caffe2/utils/math.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  |  | 
|  | class CUDAContext; | 
|  |  | 
|  | template <class Context> | 
|  | class ImageInputOp final : public PrefetchOperator<Context> { | 
|  | // SINGLE_LABEL: single integer label for multi-class classification | 
|  | // MULTI_LABEL_SPARSE: sparse active label indices for multi-label | 
|  | // classification MULTI_LABEL_DENSE: dense label embedding vector for label | 
|  | // embedding regression MULTI_LABEL_WEIGHTED_SPARSE: sparse active label | 
|  | // indices with per-label weights for multi-label classification | 
|  | // SINGLE_LABEL_WEIGHTED: single integer label for multi-class classification | 
|  | // with weighted sampling EMBEDDING_LABEL: an array of floating numbers | 
|  | // representing dense embedding. | 
|  | //   It is useful for model distillation | 
|  | enum LABEL_TYPE { | 
|  | SINGLE_LABEL = 0, | 
|  | MULTI_LABEL_SPARSE = 1, | 
|  | MULTI_LABEL_DENSE = 2, | 
|  | MULTI_LABEL_WEIGHTED_SPARSE = 3, | 
|  | SINGLE_LABEL_WEIGHTED = 4, | 
|  | EMBEDDING_LABEL = 5, | 
|  | }; | 
|  |  | 
|  | // INCEPTION_STYLE: Random crop with size 8% - 100% image area and aspect | 
|  | // ratio in [3/4, 4/3]. Reference: GoogleNet paper | 
|  | enum SCALE_JITTER_TYPE { | 
|  | NO_SCALE_JITTER = 0, | 
|  | INCEPTION_STYLE = 1 | 
|  | // TODO(zyan3): ResNet-style random scale jitter | 
|  | }; | 
|  |  | 
|  | public: | 
|  | using OperatorBase::OutputSize; | 
|  | using PrefetchOperator<Context>::context_; | 
|  | using PrefetchOperator<Context>::prefetch_thread_; | 
|  | explicit ImageInputOp(const OperatorDef& operator_def, Workspace* ws); | 
|  | ~ImageInputOp() { | 
|  | PrefetchOperator<Context>::Finalize(); | 
|  | } | 
|  |  | 
|  | bool Prefetch() override; | 
|  | bool CopyPrefetched() override; | 
|  |  | 
|  | private: | 
|  | using BoundingBox = struct { | 
|  | bool valid; | 
|  | int ymin; | 
|  | int xmin; | 
|  | int height; | 
|  | int width; | 
|  | }; | 
|  |  | 
|  | // Structure to store per-image information | 
|  | // This can be modified by the DecodeAnd* so needs | 
|  | // to be privatized per launch. | 
|  | using PerImageArg = struct { BoundingBox bounding_params; }; | 
|  |  | 
|  | bool GetImageAndLabelAndInfoFromDBValue( | 
|  | const string& value, | 
|  | cv::Mat* img, | 
|  | PerImageArg& info, | 
|  | int item_id, | 
|  | std::mt19937* randgen); | 
|  | void DecodeAndTransform( | 
|  | const std::string& value, | 
|  | float* image_data, | 
|  | int item_id, | 
|  | const int channels, | 
|  | std::size_t thread_index); | 
|  | void DecodeAndTransposeOnly( | 
|  | const std::string& value, | 
|  | uint8_t* image_data, | 
|  | int item_id, | 
|  | const int channels, | 
|  | std::size_t thread_index); | 
|  | bool ApplyTransformOnGPU( | 
|  | const std::vector<std::int64_t>& dims, | 
|  | const c10::Device& type); | 
|  |  | 
|  | unique_ptr<db::DBReader> owned_reader_; | 
|  | const db::DBReader* reader_; | 
|  | Tensor prefetched_image_; | 
|  | Tensor prefetched_label_; | 
|  | vector<Tensor> prefetched_additional_outputs_; | 
|  | Tensor prefetched_image_on_device_; | 
|  | Tensor prefetched_label_on_device_; | 
|  | vector<Tensor> prefetched_additional_outputs_on_device_; | 
|  | // Default parameters for images | 
|  | PerImageArg default_arg_; | 
|  | int batch_size_; | 
|  | LABEL_TYPE label_type_; | 
|  | int num_labels_; | 
|  |  | 
|  | bool color_; | 
|  | bool color_jitter_; | 
|  | float img_saturation_; | 
|  | float img_brightness_; | 
|  | float img_contrast_; | 
|  | bool color_lighting_; | 
|  | float color_lighting_std_; | 
|  | std::vector<std::vector<float>> color_lighting_eigvecs_; | 
|  | std::vector<float> color_lighting_eigvals_; | 
|  | SCALE_JITTER_TYPE scale_jitter_type_; | 
|  | int scale_; | 
|  | // Minsize is similar to scale except that it will only | 
|  | // force the image to scale up if it is too small. In other words, | 
|  | // it ensures that both dimensions of the image are at least minsize_ | 
|  | int minsize_; | 
|  | bool warp_; | 
|  | int crop_; | 
|  | std::vector<float> mean_; | 
|  | std::vector<float> std_; | 
|  | Tensor mean_gpu_; | 
|  | Tensor std_gpu_; | 
|  | bool mirror_; | 
|  | bool is_test_; | 
|  | bool use_caffe_datum_; | 
|  | bool gpu_transform_; | 
|  | bool mean_std_copied_ = false; | 
|  |  | 
|  | // thread pool for parse + decode | 
|  | int num_decode_threads_; | 
|  | int additional_inputs_offset_; | 
|  | int additional_inputs_count_; | 
|  | std::vector<int> additional_output_sizes_; | 
|  | std::shared_ptr<TaskThreadPool> thread_pool_; | 
|  |  | 
|  | // Output type for GPU transform path | 
|  | TensorProto_DataType output_type_; | 
|  |  | 
|  | // random minsize | 
|  | vector<int> random_scale_; | 
|  | bool random_scaling_; | 
|  |  | 
|  | // Working variables | 
|  | std::vector<std::mt19937> randgen_per_thread_; | 
|  |  | 
|  | // number of exceptions produced by opencv while reading image data | 
|  | std::atomic<long> num_decode_errors_in_batch_{0}; | 
|  | // opencv exceptions tolerance | 
|  | float max_decode_error_ratio_; | 
|  | }; | 
|  |  | 
|  | template <class Context> | 
|  | ImageInputOp<Context>::ImageInputOp( | 
|  | const OperatorDef& operator_def, | 
|  | Workspace* ws) | 
|  | : PrefetchOperator<Context>(operator_def, ws), | 
|  | reader_(nullptr), | 
|  | batch_size_( | 
|  | OperatorBase::template GetSingleArgument<int>("batch_size", 0)), | 
|  | label_type_(static_cast<LABEL_TYPE>( | 
|  | OperatorBase::template GetSingleArgument<int>("label_type", 0))), | 
|  | num_labels_( | 
|  | OperatorBase::template GetSingleArgument<int>("num_labels", 0)), | 
|  | color_(OperatorBase::template GetSingleArgument<int>("color", 1)), | 
|  | color_jitter_( | 
|  | OperatorBase::template GetSingleArgument<int>("color_jitter", 0)), | 
|  | img_saturation_(OperatorBase::template GetSingleArgument<float>( | 
|  | "img_saturation", | 
|  | 0.4)), | 
|  | img_brightness_(OperatorBase::template GetSingleArgument<float>( | 
|  | "img_brightness", | 
|  | 0.4)), | 
|  | img_contrast_( | 
|  | OperatorBase::template GetSingleArgument<float>("img_contrast", 0.4)), | 
|  | color_lighting_( | 
|  | OperatorBase::template GetSingleArgument<int>("color_lighting", 0)), | 
|  | color_lighting_std_(OperatorBase::template GetSingleArgument<float>( | 
|  | "color_lighting_std", | 
|  | 0.1)), | 
|  | scale_jitter_type_(static_cast<SCALE_JITTER_TYPE>( | 
|  | OperatorBase::template GetSingleArgument<int>( | 
|  | "scale_jitter_type", | 
|  | 0))), | 
|  | scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)), | 
|  | minsize_(OperatorBase::template GetSingleArgument<int>("minsize", -1)), | 
|  | warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)), | 
|  | crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)), | 
|  | mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)), | 
|  | is_test_(OperatorBase::template GetSingleArgument<int>( | 
|  | OpSchema::Arg_IsTest, | 
|  | 0)), | 
|  | use_caffe_datum_( | 
|  | OperatorBase::template GetSingleArgument<int>("use_caffe_datum", 0)), | 
|  | gpu_transform_(OperatorBase::template GetSingleArgument<int>( | 
|  | "use_gpu_transform", | 
|  | 0)), | 
|  | num_decode_threads_( | 
|  | OperatorBase::template GetSingleArgument<int>("decode_threads", 4)), | 
|  | additional_output_sizes_( | 
|  | OperatorBase::template GetRepeatedArgument<int>("output_sizes", {})), | 
|  | thread_pool_(std::make_shared<TaskThreadPool>(num_decode_threads_)), | 
|  | // output type only supported with CUDA and use_gpu_transform for now | 
|  | output_type_( | 
|  | cast::GetCastDataType(ArgumentHelper(operator_def), "output_type")), | 
|  | random_scale_(OperatorBase::template GetRepeatedArgument<int>( | 
|  | "random_scale", | 
|  | {-1, -1})), | 
|  | max_decode_error_ratio_(OperatorBase::template GetSingleArgument<float>( | 
|  | "max_decode_error_ratio", | 
|  | 1.0)) { | 
|  | if ((random_scale_[0] == -1) || (random_scale_[1] == -1)) { | 
|  | random_scaling_ = false; | 
|  | } else { | 
|  | random_scaling_ = true; | 
|  | minsize_ = random_scale_[0]; | 
|  | } | 
|  |  | 
|  | mean_ = OperatorBase::template GetRepeatedArgument<float>( | 
|  | "mean_per_channel", | 
|  | {OperatorBase::template GetSingleArgument<float>("mean", 0.)}); | 
|  |  | 
|  | std_ = OperatorBase::template GetRepeatedArgument<float>( | 
|  | "std_per_channel", | 
|  | {OperatorBase::template GetSingleArgument<float>("std", 1.)}); | 
|  |  | 
|  | if (additional_output_sizes_.size() == 0) { | 
|  | additional_output_sizes_ = std::vector<int>(OutputSize() - 2, 1); | 
|  | } else { | 
|  | CAFFE_ENFORCE( | 
|  | additional_output_sizes_.size() == OutputSize() - 2, | 
|  | "If the output sizes are specified, they must be specified for all " | 
|  | "additional outputs"); | 
|  | } | 
|  | additional_inputs_count_ = OutputSize() - 2; | 
|  |  | 
|  | default_arg_.bounding_params = { | 
|  | false, | 
|  | OperatorBase::template GetSingleArgument<int>("bounding_ymin", -1), | 
|  | OperatorBase::template GetSingleArgument<int>("bounding_xmin", -1), | 
|  | OperatorBase::template GetSingleArgument<int>("bounding_height", -1), | 
|  | OperatorBase::template GetSingleArgument<int>("bounding_width", -1), | 
|  | }; | 
|  |  | 
|  | if (operator_def.input_size() == 0) { | 
|  | LOG(ERROR) << "You are using an old ImageInputOp format that creates " | 
|  | "a local db reader. Consider moving to the new style " | 
|  | "that takes in a DBReader blob instead."; | 
|  | string db_name = OperatorBase::template GetSingleArgument<string>("db", ""); | 
|  | CAFFE_ENFORCE_GT(db_name.size(), 0, "Must specify a db name."); | 
|  | owned_reader_.reset(new db::DBReader( | 
|  | OperatorBase::template GetSingleArgument<string>("db_type", "leveldb"), | 
|  | db_name)); | 
|  | reader_ = owned_reader_.get(); | 
|  | } | 
|  |  | 
|  | // hard-coded PCA eigenvectors and eigenvalues, based on RBG channel order | 
|  | color_lighting_eigvecs_.push_back( | 
|  | std::vector<float>{-144.7125f, 183.396f, 102.2295f}); | 
|  | color_lighting_eigvecs_.push_back( | 
|  | std::vector<float>{-148.104f, -1.1475f, -207.57f}); | 
|  | color_lighting_eigvecs_.push_back( | 
|  | std::vector<float>{-148.818f, -177.174f, 107.1765f}); | 
|  |  | 
|  | color_lighting_eigvals_ = std::vector<float>{0.2175f, 0.0188f, 0.0045f}; | 
|  |  | 
|  | CAFFE_ENFORCE_GT(batch_size_, 0, "Batch size should be nonnegative."); | 
|  | if (use_caffe_datum_) { | 
|  | CAFFE_ENFORCE( | 
|  | label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED, | 
|  | "Caffe datum only supports single integer label"); | 
|  | } | 
|  | if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) { | 
|  | CAFFE_ENFORCE_GT( | 
|  | num_labels_, | 
|  | 0, | 
|  | "Number of labels must be set for using either sparse label indices or dense label embedding."); | 
|  | } | 
|  | if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE || | 
|  | label_type_ == SINGLE_LABEL_WEIGHTED) { | 
|  | additional_inputs_offset_ = 3; | 
|  | } else { | 
|  | additional_inputs_offset_ = 2; | 
|  | } | 
|  | CAFFE_ENFORCE( | 
|  | (scale_ > 0) != (minsize_ > 0), | 
|  | "Must provide one and only one of scaling or minsize"); | 
|  | CAFFE_ENFORCE_GT(crop_, 0, "Must provide the cropping value."); | 
|  | CAFFE_ENFORCE_GE( | 
|  | scale_ > 0 ? scale_ : minsize_, | 
|  | crop_, | 
|  | "The scale/minsize value must be no smaller than the crop value."); | 
|  |  | 
|  | CAFFE_ENFORCE_EQ( | 
|  | mean_.size(), | 
|  | std_.size(), | 
|  | "The mean and std. dev vectors must be of the same size."); | 
|  | CAFFE_ENFORCE( | 
|  | mean_.size() == 1 || mean_.size() == 3, | 
|  | "The mean and std. dev vectors must be of size 1 or 3"); | 
|  | CAFFE_ENFORCE( | 
|  | !use_caffe_datum_ || OutputSize() == 2, | 
|  | "There can only be 2 outputs if the Caffe datum format is used"); | 
|  |  | 
|  | CAFFE_ENFORCE( | 
|  | random_scale_.size() == 2, "Must provide [scale_min, scale_max]"); | 
|  | CAFFE_ENFORCE_GE( | 
|  | random_scale_[1], | 
|  | random_scale_[0], | 
|  | "random scale must provide a range [min, max]"); | 
|  |  | 
|  | if (default_arg_.bounding_params.ymin < 0 || | 
|  | default_arg_.bounding_params.xmin < 0 || | 
|  | default_arg_.bounding_params.height < 0 || | 
|  | default_arg_.bounding_params.width < 0) { | 
|  | default_arg_.bounding_params.valid = false; | 
|  | } else { | 
|  | default_arg_.bounding_params.valid = true; | 
|  | } | 
|  |  | 
|  | if (mean_.size() == 1) { | 
|  | // We are going to extend to 3 using the first value | 
|  | mean_.resize(3, mean_[0]); | 
|  | std_.resize(3, std_[0]); | 
|  | } | 
|  |  | 
|  | LOG(INFO) << "Creating an image input op with the following setting: "; | 
|  | LOG(INFO) << "    Using " << num_decode_threads_ << " CPU threads;"; | 
|  | if (gpu_transform_) { | 
|  | LOG(INFO) << "    Performing transformation on GPU"; | 
|  | } | 
|  | LOG(INFO) << "    Outputting in batches of " << batch_size_ << " images;"; | 
|  | LOG(INFO) << "    Treating input image as " | 
|  | << (color_ ? "color " : "grayscale ") << "image;"; | 
|  | if (default_arg_.bounding_params.valid) { | 
|  | LOG(INFO) << "    Applying a default bounding box of Y [" | 
|  | << default_arg_.bounding_params.ymin << "; " | 
|  | << default_arg_.bounding_params.ymin + | 
|  | default_arg_.bounding_params.height | 
|  | << ") x X [" << default_arg_.bounding_params.xmin << "; " | 
|  | << default_arg_.bounding_params.xmin + | 
|  | default_arg_.bounding_params.width | 
|  | << ")"; | 
|  | } | 
|  | if (scale_ > 0 && !random_scaling_) { | 
|  | LOG(INFO) << "    Scaling image to " << scale_ | 
|  | << (warp_ ? " with " : " without ") << "warping;"; | 
|  | } else { | 
|  | if (random_scaling_) { | 
|  | // randomly set min_size_ for each image | 
|  | LOG(INFO) << "    Randomly scaling shortest side between " | 
|  | << random_scale_[0] << " and " << random_scale_[1]; | 
|  | } else { | 
|  | // Here, minsize_ > 0 | 
|  | LOG(INFO) << "    Ensuring minimum image size of " << minsize_ | 
|  | << (warp_ ? " with " : " without ") << "warping;"; | 
|  | } | 
|  | } | 
|  | LOG(INFO) << "    " << (is_test_ ? "Central" : "Random") | 
|  | << " cropping image to " << crop_ | 
|  | << (mirror_ ? " with " : " without ") << "random mirroring;"; | 
|  | LOG(INFO) << "Label Type: " << label_type_; | 
|  | LOG(INFO) << "Num Labels: " << num_labels_; | 
|  |  | 
|  | auto mit = mean_.begin(); | 
|  | auto sit = std_.begin(); | 
|  |  | 
|  | for (int i = 0; mit != mean_.end() && sit != std_.end(); ++mit, ++sit, ++i) { | 
|  | LOG(INFO) << "    Default [Channel " << i << "] Subtract mean " << *mit | 
|  | << " and divide by std " << *sit << "."; | 
|  | // We actually will use the inverse of std, so inverse it here | 
|  | *sit = 1.f / *sit; | 
|  | } | 
|  | LOG(INFO) << "    Outputting images as " | 
|  | << OperatorBase::template GetSingleArgument<string>( | 
|  | "output_type", "unknown") | 
|  | << "."; | 
|  |  | 
|  | std::mt19937 meta_randgen(time(nullptr)); | 
|  | for (int i = 0; i < num_decode_threads_; ++i) { | 
|  | randgen_per_thread_.emplace_back(meta_randgen()); | 
|  | } | 
|  | ReinitializeTensor( | 
|  | &prefetched_image_, | 
|  | {int64_t(batch_size_), | 
|  | int64_t(crop_), | 
|  | int64_t(crop_), | 
|  | int64_t(color_ ? 3 : 1)}, | 
|  | at::dtype<uint8_t>().device(CPU)); | 
|  | std::vector<int64_t> sizes; | 
|  | if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) { | 
|  | sizes = std::vector<int64_t>{int64_t(batch_size_), int64_t(num_labels_)}; | 
|  | } else { | 
|  | sizes = std::vector<int64_t>{batch_size_}; | 
|  | } | 
|  | // data type for prefetched_label_ is actually not known here.. | 
|  | ReinitializeTensor(&prefetched_label_, sizes, at::dtype<int>().device(CPU)); | 
|  |  | 
|  | for (int i = 0; i < additional_output_sizes_.size(); ++i) { | 
|  | prefetched_additional_outputs_on_device_.emplace_back(); | 
|  | prefetched_additional_outputs_.emplace_back(); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Inception-stype scale jittering | 
|  | template <class Context> | 
|  | bool RandomSizedCropping(cv::Mat* img, const int crop, std::mt19937* randgen) { | 
|  | cv::Mat scaled_img; | 
|  | bool inception_scale_jitter = false; | 
|  | int im_height = img->rows, im_width = img->cols; | 
|  | int area = im_height * im_width; | 
|  | std::uniform_real_distribution<> area_dis(0.08, 1.0); | 
|  | std::uniform_real_distribution<> aspect_ratio_dis(3.0 / 4.0, 4.0 / 3.0); | 
|  |  | 
|  | cv::Mat cropping; | 
|  | for (int i = 0; i < 10; ++i) { | 
|  | int target_area = int(ceil(area_dis(*randgen) * area)); | 
|  | float aspect_ratio = aspect_ratio_dis(*randgen); | 
|  | int nh = floor(std::sqrt(((float)target_area / aspect_ratio))); | 
|  | int nw = floor(std::sqrt(((float)target_area * aspect_ratio))); | 
|  | if (nh >= 1 && nh <= im_height && nw >= 1 && nw <= im_width) { | 
|  | int height_offset = | 
|  | std::uniform_int_distribution<>(0, im_height - nh)(*randgen); | 
|  | int width_offset = | 
|  | std::uniform_int_distribution<>(0, im_width - nw)(*randgen); | 
|  | cv::Rect ROI(width_offset, height_offset, nw, nh); | 
|  | cropping = (*img)(ROI); | 
|  | cv::resize( | 
|  | cropping, scaled_img, cv::Size(crop, crop), 0, 0, cv::INTER_AREA); | 
|  | *img = scaled_img; | 
|  | inception_scale_jitter = true; | 
|  | break; | 
|  | } | 
|  | } | 
|  | return inception_scale_jitter; | 
|  | } | 
|  |  | 
|  | template <class Context> | 
|  | bool ImageInputOp<Context>::GetImageAndLabelAndInfoFromDBValue( | 
|  | const string& value, | 
|  | cv::Mat* img, | 
|  | PerImageArg& info, | 
|  | int item_id, | 
|  | std::mt19937* randgen) { | 
|  | // | 
|  | // recommend using --caffe2_use_fatal_for_enforce=1 when using ImageInputOp | 
|  | // as this function runs on a worker thread and the exceptions from | 
|  | // CAFFE_ENFORCE are silently dropped by the thread worker functions | 
|  | // | 
|  | cv::Mat src; | 
|  |  | 
|  | // Use the default information for images | 
|  | info = default_arg_; | 
|  | if (use_caffe_datum_) { | 
|  | // The input is a caffe datum format. | 
|  | CaffeDatum datum; | 
|  | CAFFE_ENFORCE(datum.ParseFromString(value)); | 
|  |  | 
|  | prefetched_label_.mutable_data<int>()[item_id] = datum.label(); | 
|  | if (datum.encoded()) { | 
|  | // encoded image in datum. | 
|  | // count the number of exceptions from opencv imdecode | 
|  | try { | 
|  | src = cv::imdecode( | 
|  | cv::Mat( | 
|  | 1, | 
|  | datum.data().size(), | 
|  | CV_8UC1, | 
|  | const_cast<char*>(datum.data().data())), | 
|  | color_ ? cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE); | 
|  | if (src.rows == 0 || src.cols == 0) { | 
|  | num_decode_errors_in_batch_++; | 
|  | src = cv::Mat::zeros(cv::Size(224, 224), CV_8UC3); | 
|  | } | 
|  | } catch (cv::Exception& e) { | 
|  | num_decode_errors_in_batch_++; | 
|  | src = cv::Mat::zeros(cv::Size(224, 224), CV_8UC3); | 
|  | } | 
|  | } else { | 
|  | // Raw image in datum. | 
|  | CAFFE_ENFORCE(datum.channels() == 3 || datum.channels() == 1); | 
|  |  | 
|  | int src_c = datum.channels(); | 
|  | src.create( | 
|  | datum.height(), datum.width(), (src_c == 3) ? CV_8UC3 : CV_8UC1); | 
|  |  | 
|  | if (src_c == 1) { | 
|  | memcpy(src.ptr<uchar>(0), datum.data().data(), datum.data().size()); | 
|  | } else { | 
|  | // Datum stores things in CHW order, let's do HWC for images to make | 
|  | // things more consistent with conventional image storage. | 
|  | for (int c = 0; c < 3; ++c) { | 
|  | const char* datum_buffer = | 
|  | datum.data().data() + datum.height() * datum.width() * c; | 
|  | uchar* ptr = src.ptr<uchar>(0) + c; | 
|  | for (int h = 0; h < datum.height(); ++h) { | 
|  | for (int w = 0; w < datum.width(); ++w) { | 
|  | *ptr = *(datum_buffer++); | 
|  | ptr += 3; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } else { | 
|  | // The input is a caffe2 format. | 
|  | TensorProtos protos; | 
|  | CAFFE_ENFORCE(protos.ParseFromString(value)); | 
|  | const TensorProto& image_proto = protos.protos(0); | 
|  | const TensorProto& label_proto = protos.protos(1); | 
|  | // add handle protos | 
|  | vector<TensorProto> additional_output_protos; | 
|  | int start = additional_inputs_offset_; | 
|  | int end = start + additional_inputs_count_; | 
|  | for (int i = start; i < end; ++i) { | 
|  | additional_output_protos.push_back(protos.protos(i)); | 
|  | } | 
|  |  | 
|  | if (protos.protos_size() == end + 1) { | 
|  | // We have bounding box information | 
|  | const TensorProto& bounding_proto = protos.protos(end); | 
|  | DCHECK_EQ(bounding_proto.data_type(), TensorProto::INT32); | 
|  | DCHECK_EQ(bounding_proto.int32_data_size(), 4); | 
|  | info.bounding_params.valid = true; | 
|  | info.bounding_params.ymin = bounding_proto.int32_data(0); | 
|  | info.bounding_params.xmin = bounding_proto.int32_data(1); | 
|  | info.bounding_params.height = bounding_proto.int32_data(2); | 
|  | info.bounding_params.width = bounding_proto.int32_data(3); | 
|  | } | 
|  |  | 
|  | if (image_proto.data_type() == TensorProto::STRING) { | 
|  | // encoded image string. | 
|  | DCHECK_EQ(image_proto.string_data_size(), 1); | 
|  | const string& encoded_image_str = image_proto.string_data(0); | 
|  | int encoded_size = encoded_image_str.size(); | 
|  | // We use a cv::Mat to wrap the encoded str so we do not need a copy. | 
|  | // count the number of exceptions from opencv imdecode | 
|  | try { | 
|  | src = cv::imdecode( | 
|  | cv::Mat( | 
|  | 1, | 
|  | &encoded_size, | 
|  | CV_8UC1, | 
|  | const_cast<char*>(encoded_image_str.data())), | 
|  | color_ ? cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE); | 
|  | if (src.rows == 0 || src.cols == 0) { | 
|  | num_decode_errors_in_batch_++; | 
|  | src = cv::Mat::zeros(cv::Size(224, 224), CV_8UC3); | 
|  | } | 
|  | } catch (cv::Exception& e) { | 
|  | num_decode_errors_in_batch_++; | 
|  | src = cv::Mat::zeros(cv::Size(224, 224), CV_8UC3); | 
|  | } | 
|  | } else if (image_proto.data_type() == TensorProto::BYTE) { | 
|  | // raw image content. | 
|  | int src_c = (image_proto.dims_size() == 3) ? image_proto.dims(2) : 1; | 
|  | CAFFE_ENFORCE(src_c == 3 || src_c == 1); | 
|  |  | 
|  | src.create( | 
|  | image_proto.dims(0), | 
|  | image_proto.dims(1), | 
|  | (src_c == 3) ? CV_8UC3 : CV_8UC1); | 
|  | memcpy( | 
|  | src.ptr<uchar>(0), | 
|  | image_proto.byte_data().data(), | 
|  | image_proto.byte_data().size()); | 
|  | } else { | 
|  | LOG(FATAL) << "Unknown image data type."; | 
|  | } | 
|  |  | 
|  | // TODO: if image decoding was unsuccessful, set label to 0 | 
|  | if (label_proto.data_type() == TensorProto::FLOAT) { | 
|  | if (label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED) { | 
|  | DCHECK_EQ(label_proto.float_data_size(), 1); | 
|  | prefetched_label_.mutable_data<float>()[item_id] = | 
|  | label_proto.float_data(0); | 
|  | } else if (label_type_ == MULTI_LABEL_SPARSE) { | 
|  | float* label_data = | 
|  | prefetched_label_.mutable_data<float>() + item_id * num_labels_; | 
|  | memset(label_data, 0, sizeof(float) * num_labels_); | 
|  | for (int i = 0; i < label_proto.float_data_size(); ++i) { | 
|  | label_data[(int)label_proto.float_data(i)] = 1.0; | 
|  | } | 
|  | } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) { | 
|  | const TensorProto& weight_proto = protos.protos(2); | 
|  | float* label_data = | 
|  | prefetched_label_.mutable_data<float>() + item_id * num_labels_; | 
|  | memset(label_data, 0, sizeof(float) * num_labels_); | 
|  | for (int i = 0; i < label_proto.float_data_size(); ++i) { | 
|  | label_data[(int)label_proto.float_data(i)] = | 
|  | weight_proto.float_data(i); | 
|  | } | 
|  | } else if ( | 
|  | label_type_ == MULTI_LABEL_DENSE || label_type_ == EMBEDDING_LABEL) { | 
|  | CAFFE_ENFORCE(label_proto.float_data_size() == num_labels_); | 
|  | float* label_data = | 
|  | prefetched_label_.mutable_data<float>() + item_id * num_labels_; | 
|  | for (int i = 0; i < label_proto.float_data_size(); ++i) { | 
|  | label_data[i] = label_proto.float_data(i); | 
|  | } | 
|  | } else { | 
|  | LOG(ERROR) << "Unknown label type:" << label_type_; | 
|  | } | 
|  | } else if (label_proto.data_type() == TensorProto::INT32) { | 
|  | if (label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED) { | 
|  | DCHECK_EQ(label_proto.int32_data_size(), 1); | 
|  | prefetched_label_.mutable_data<int>()[item_id] = | 
|  | label_proto.int32_data(0); | 
|  | } else if (label_type_ == MULTI_LABEL_SPARSE) { | 
|  | int* label_data = | 
|  | prefetched_label_.mutable_data<int>() + item_id * num_labels_; | 
|  | memset(label_data, 0, sizeof(int) * num_labels_); | 
|  | for (int i = 0; i < label_proto.int32_data_size(); ++i) { | 
|  | label_data[label_proto.int32_data(i)] = 1; | 
|  | } | 
|  | } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) { | 
|  | const TensorProto& weight_proto = protos.protos(2); | 
|  | float* label_data = | 
|  | prefetched_label_.mutable_data<float>() + item_id * num_labels_; | 
|  | memset(label_data, 0, sizeof(float) * num_labels_); | 
|  | for (int i = 0; i < label_proto.int32_data_size(); ++i) { | 
|  | label_data[label_proto.int32_data(i)] = weight_proto.float_data(i); | 
|  | } | 
|  | } else if ( | 
|  | label_type_ == MULTI_LABEL_DENSE || label_type_ == EMBEDDING_LABEL) { | 
|  | CAFFE_ENFORCE(label_proto.int32_data_size() == num_labels_); | 
|  | int* label_data = | 
|  | prefetched_label_.mutable_data<int>() + item_id * num_labels_; | 
|  | for (int i = 0; i < label_proto.int32_data_size(); ++i) { | 
|  | label_data[i] = label_proto.int32_data(i); | 
|  | } | 
|  | } else { | 
|  | LOG(ERROR) << "Unknown label type:" << label_type_; | 
|  | } | 
|  | } else { | 
|  | LOG(FATAL) << "Unsupported label data type."; | 
|  | } | 
|  |  | 
|  | for (int i = 0; i < additional_output_protos.size(); ++i) { | 
|  | auto additional_output_proto = additional_output_protos[i]; | 
|  | if (additional_output_proto.data_type() == TensorProto::FLOAT) { | 
|  | float* additional_output = | 
|  | prefetched_additional_outputs_[i].template mutable_data<float>() + | 
|  | item_id * additional_output_proto.float_data_size(); | 
|  |  | 
|  | for (int j = 0; j < additional_output_proto.float_data_size(); ++j) { | 
|  | additional_output[j] = additional_output_proto.float_data(j); | 
|  | } | 
|  | } else if (additional_output_proto.data_type() == TensorProto::INT32) { | 
|  | int* additional_output = | 
|  | prefetched_additional_outputs_[i].template mutable_data<int>() + | 
|  | item_id * additional_output_proto.int32_data_size(); | 
|  |  | 
|  | for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) { | 
|  | additional_output[j] = additional_output_proto.int32_data(j); | 
|  | } | 
|  | } else if (additional_output_proto.data_type() == TensorProto::INT64) { | 
|  | int64_t* additional_output = | 
|  | prefetched_additional_outputs_[i].template mutable_data<int64_t>() + | 
|  | item_id * additional_output_proto.int64_data_size(); | 
|  |  | 
|  | for (int j = 0; j < additional_output_proto.int64_data_size(); ++j) { | 
|  | additional_output[j] = additional_output_proto.int64_data(j); | 
|  | } | 
|  | } else if (additional_output_proto.data_type() == TensorProto::UINT8) { | 
|  | uint8_t* additional_output = | 
|  | prefetched_additional_outputs_[i].template mutable_data<uint8_t>() + | 
|  | item_id * additional_output_proto.int32_data_size(); | 
|  |  | 
|  | for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) { | 
|  | additional_output[j] = | 
|  | static_cast<uint8_t>(additional_output_proto.int32_data(j)); | 
|  | } | 
|  | } else { | 
|  | LOG(FATAL) << "Unsupported output type."; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // | 
|  | // convert source to the color format requested from Op | 
|  | // | 
|  | int out_c = color_ ? 3 : 1; | 
|  | if (out_c == src.channels()) { | 
|  | *img = src; | 
|  | } else { | 
|  | cv::cvtColor( | 
|  | src, *img, (out_c == 1) ? cv::COLOR_BGR2GRAY : cv::COLOR_GRAY2BGR); | 
|  | } | 
|  |  | 
|  | // Note(Yangqing): I believe that the mat should be created continuous. | 
|  | CAFFE_ENFORCE(img->isContinuous()); | 
|  |  | 
|  | // Sanity check now that we decoded everything | 
|  |  | 
|  | // Ensure that the bounding box is legit | 
|  | if (info.bounding_params.valid && | 
|  | (src.rows < info.bounding_params.ymin + info.bounding_params.height || | 
|  | src.cols < info.bounding_params.xmin + info.bounding_params.width)) { | 
|  | info.bounding_params.valid = false; | 
|  | } | 
|  |  | 
|  | // Apply the bounding box if requested | 
|  | if (info.bounding_params.valid) { | 
|  | // If we reach here, we know the parameters are sane | 
|  | cv::Rect bounding_box( | 
|  | info.bounding_params.xmin, | 
|  | info.bounding_params.ymin, | 
|  | info.bounding_params.width, | 
|  | info.bounding_params.height); | 
|  | *img = (*img)(bounding_box); | 
|  |  | 
|  | /* | 
|  | LOG(INFO) << "Did bounding with ymin:" | 
|  | << info.bounding_params.ymin << " xmin:" << | 
|  | info.bounding_params.xmin | 
|  | << " height:" << info.bounding_params.height | 
|  | << " width:" << info.bounding_params.width << "\n"; | 
|  | LOG(INFO) << "Bounded matrix: " << img; | 
|  | */ | 
|  | } else { | 
|  | // LOG(INFO) << "No bounding\n"; | 
|  | } | 
|  |  | 
|  | cv::Mat scaled_img; | 
|  | bool inception_scale_jitter = false; | 
|  | if (scale_jitter_type_ == INCEPTION_STYLE) { | 
|  | if (!is_test_) { | 
|  | // Inception-stype scale jittering is only used for training | 
|  | inception_scale_jitter = | 
|  | RandomSizedCropping<Context>(img, crop_, randgen); | 
|  | // if a random crop is still not found, do simple random cropping later | 
|  | } | 
|  | } | 
|  |  | 
|  | if ((scale_jitter_type_ == NO_SCALE_JITTER) || | 
|  | (scale_jitter_type_ == INCEPTION_STYLE && !inception_scale_jitter)) { | 
|  | int scaled_width, scaled_height; | 
|  | int scale_to_use = scale_ > 0 ? scale_ : minsize_; | 
|  |  | 
|  | // set the random minsize | 
|  | if (random_scaling_) { | 
|  | scale_to_use = std::uniform_int_distribution<>( | 
|  | random_scale_[0], random_scale_[1])(*randgen); | 
|  | } | 
|  |  | 
|  | if (warp_) { | 
|  | scaled_width = scale_to_use; | 
|  | scaled_height = scale_to_use; | 
|  | } else if (img->rows > img->cols) { | 
|  | scaled_width = scale_to_use; | 
|  | scaled_height = static_cast<float>(img->rows) * scale_to_use / img->cols; | 
|  | } else { | 
|  | scaled_height = scale_to_use; | 
|  | scaled_width = static_cast<float>(img->cols) * scale_to_use / img->rows; | 
|  | } | 
|  | if ((scale_ > 0 && | 
|  | (scaled_height != img->rows || scaled_width != img->cols)) || | 
|  | (scaled_height > img->rows || scaled_width > img->cols)) { | 
|  | // We rescale in all cases if we are using scale_ | 
|  | // but only to make the image bigger if using minsize_ | 
|  | /* | 
|  | LOG(INFO) << "Scaling to " << scaled_width << " x " << scaled_height | 
|  | << " From " << img->cols << " x " << img->rows; | 
|  | */ | 
|  | cv::resize( | 
|  | *img, | 
|  | scaled_img, | 
|  | cv::Size(scaled_width, scaled_height), | 
|  | 0, | 
|  | 0, | 
|  | cv::INTER_AREA); | 
|  | *img = scaled_img; | 
|  | } | 
|  | } | 
|  |  | 
|  | // TODO(Yangqing): return false if any error happens. | 
|  | return true; | 
|  | } | 
|  |  | 
|  | // assume HWC order and color channels BGR | 
|  | template <class Context> | 
|  | void Saturation( | 
|  | float* img, | 
|  | const int img_size, | 
|  | const float alpha_rand, | 
|  | std::mt19937* randgen) { | 
|  | float alpha = 1.0f + | 
|  | std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen); | 
|  | // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 | 
|  | int p = 0; | 
|  | for (int h = 0; h < img_size; ++h) { | 
|  | for (int w = 0; w < img_size; ++w) { | 
|  | float gray_color = img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + | 
|  | img[3 * p + 2] * 0.299f; | 
|  | for (int c = 0; c < 3; ++c) { | 
|  | img[3 * p + c] = img[3 * p + c] * alpha + gray_color * (1.0f - alpha); | 
|  | } | 
|  | p++; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // assume HWC order and color channels BGR | 
|  | template <class Context> | 
|  | void Brightness( | 
|  | float* img, | 
|  | const int img_size, | 
|  | const float alpha_rand, | 
|  | std::mt19937* randgen) { | 
|  | float alpha = 1.0f + | 
|  | std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen); | 
|  | int p = 0; | 
|  | for (int h = 0; h < img_size; ++h) { | 
|  | for (int w = 0; w < img_size; ++w) { | 
|  | for (int c = 0; c < 3; ++c) { | 
|  | img[p++] *= alpha; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // assume HWC order and color channels BGR | 
|  | template <class Context> | 
|  | void Contrast( | 
|  | float* img, | 
|  | const int img_size, | 
|  | const float alpha_rand, | 
|  | std::mt19937* randgen) { | 
|  | float gray_mean = 0; | 
|  | int p = 0; | 
|  | for (int h = 0; h < img_size; ++h) { | 
|  | for (int w = 0; w < img_size; ++w) { | 
|  | // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 | 
|  | gray_mean += img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + | 
|  | img[3 * p + 2] * 0.299f; | 
|  | p++; | 
|  | } | 
|  | } | 
|  | gray_mean /= (img_size * img_size); | 
|  |  | 
|  | float alpha = 1.0f + | 
|  | std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen); | 
|  | p = 0; | 
|  | for (int h = 0; h < img_size; ++h) { | 
|  | for (int w = 0; w < img_size; ++w) { | 
|  | for (int c = 0; c < 3; ++c) { | 
|  | img[p] = img[p] * alpha + gray_mean * (1.0f - alpha); | 
|  | p++; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // assume HWC order and color channels BGR | 
|  | template <class Context> | 
|  | void ColorJitter( | 
|  | float* img, | 
|  | const int img_size, | 
|  | const float saturation, | 
|  | const float brightness, | 
|  | const float contrast, | 
|  | std::mt19937* randgen) { | 
|  | std::srand(unsigned(std::time(0))); | 
|  | std::vector<int> jitter_order{0, 1, 2}; | 
|  | // obtain a time-based seed: | 
|  | unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); | 
|  | std::shuffle( | 
|  | jitter_order.begin(), | 
|  | jitter_order.end(), | 
|  | std::default_random_engine(seed)); | 
|  |  | 
|  | for (int i = 0; i < 3; ++i) { | 
|  | if (jitter_order[i] == 0) { | 
|  | Saturation<Context>(img, img_size, saturation, randgen); | 
|  | } else if (jitter_order[i] == 1) { | 
|  | Brightness<Context>(img, img_size, brightness, randgen); | 
|  | } else { | 
|  | Contrast<Context>(img, img_size, contrast, randgen); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // assume HWC order and color channels BGR | 
|  | template <class Context> | 
|  | void ColorLighting( | 
|  | float* img, | 
|  | const int img_size, | 
|  | const float alpha_std, | 
|  | const std::vector<std::vector<float>>& eigvecs, | 
|  | const std::vector<float>& eigvals, | 
|  | std::mt19937* randgen) { | 
|  | std::normal_distribution<float> d(0, alpha_std); | 
|  | std::vector<float> alphas(3); | 
|  | for (int i = 0; i < 3; ++i) { | 
|  | alphas[i] = d(*randgen); | 
|  | } | 
|  |  | 
|  | std::vector<float> delta_rgb(3, 0.0); | 
|  | for (int i = 0; i < 3; ++i) { | 
|  | for (int j = 0; j < 3; ++j) { | 
|  | delta_rgb[i] += eigvecs[i][j] * eigvals[j] * alphas[j]; | 
|  | } | 
|  | } | 
|  |  | 
|  | int p = 0; | 
|  | for (int h = 0; h < img_size; ++h) { | 
|  | for (int w = 0; w < img_size; ++w) { | 
|  | for (int c = 0; c < 3; ++c) { | 
|  | img[p++] += delta_rgb[2 - c]; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // assume HWC order and color channels BGR | 
|  | // mean subtraction and scaling. | 
|  | template <class Context> | 
|  | void ColorNormalization( | 
|  | float* img, | 
|  | const int img_size, | 
|  | const int channels, | 
|  | const std::vector<float>& mean, | 
|  | const std::vector<float>& std) { | 
|  | int p = 0; | 
|  | for (int h = 0; h < img_size; ++h) { | 
|  | for (int w = 0; w < img_size; ++w) { | 
|  | for (int c = 0; c < channels; ++c) { | 
|  | img[p] = (img[p] - mean[c]) * std[c]; | 
|  | p++; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Factored out image transformation | 
|  | template <class Context> | 
|  | void TransformImage( | 
|  | const cv::Mat& scaled_img, | 
|  | const int channels, | 
|  | float* image_data, | 
|  | const bool color_jitter, | 
|  | const float saturation, | 
|  | const float brightness, | 
|  | const float contrast, | 
|  | const bool color_lighting, | 
|  | const float color_lighting_std, | 
|  | const std::vector<std::vector<float>>& color_lighting_eigvecs, | 
|  | const std::vector<float>& color_lighting_eigvals, | 
|  | const int crop, | 
|  | const bool mirror, | 
|  | const std::vector<float>& mean, | 
|  | const std::vector<float>& std, | 
|  | std::mt19937* randgen, | 
|  | std::bernoulli_distribution* mirror_this_image, | 
|  | bool is_test = false) { | 
|  | CAFFE_ENFORCE_GE( | 
|  | scaled_img.rows, crop, "Image height must be bigger than crop."); | 
|  | CAFFE_ENFORCE_GE( | 
|  | scaled_img.cols, crop, "Image width must be bigger than crop."); | 
|  |  | 
|  | // find the cropped region, and copy it to the destination matrix | 
|  | int width_offset, height_offset; | 
|  | if (is_test) { | 
|  | width_offset = (scaled_img.cols - crop) / 2; | 
|  | height_offset = (scaled_img.rows - crop) / 2; | 
|  | } else { | 
|  | width_offset = | 
|  | std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); | 
|  | height_offset = | 
|  | std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); | 
|  | } | 
|  |  | 
|  | float* image_data_ptr = image_data; | 
|  | if (!is_test && mirror && (*mirror_this_image)(*randgen)) { | 
|  | // Copy mirrored image. | 
|  | for (int h = height_offset; h < height_offset + crop; ++h) { | 
|  | for (int w = width_offset + crop - 1; w >= width_offset; --w) { | 
|  | const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; | 
|  | for (int c = 0; c < channels; ++c) { | 
|  | *(image_data_ptr++) = static_cast<float>(cv_data[c]); | 
|  | } | 
|  | } | 
|  | } | 
|  | } else { | 
|  | // Copy normally. | 
|  | for (int h = height_offset; h < height_offset + crop; ++h) { | 
|  | for (int w = width_offset; w < width_offset + crop; ++w) { | 
|  | const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; | 
|  | for (int c = 0; c < channels; ++c) { | 
|  | *(image_data_ptr++) = static_cast<float>(cv_data[c]); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (color_jitter && channels == 3 && !is_test) { | 
|  | ColorJitter<Context>( | 
|  | image_data, crop, saturation, brightness, contrast, randgen); | 
|  | } | 
|  | if (color_lighting && channels == 3 && !is_test) { | 
|  | ColorLighting<Context>( | 
|  | image_data, | 
|  | crop, | 
|  | color_lighting_std, | 
|  | color_lighting_eigvecs, | 
|  | color_lighting_eigvals, | 
|  | randgen); | 
|  | } | 
|  |  | 
|  | // Color normalization | 
|  | // Mean subtraction and scaling. | 
|  | ColorNormalization<Context>(image_data, crop, channels, mean, std); | 
|  | } | 
|  |  | 
|  | // Only crop / transose the image | 
|  | // leave in uint8_t dataType | 
|  | template <class Context> | 
|  | void CropTransposeImage( | 
|  | const cv::Mat& scaled_img, | 
|  | const int channels, | 
|  | uint8_t* cropped_data, | 
|  | const int crop, | 
|  | const bool mirror, | 
|  | std::mt19937* randgen, | 
|  | std::bernoulli_distribution* mirror_this_image, | 
|  | bool is_test = false) { | 
|  | CAFFE_ENFORCE_GE( | 
|  | scaled_img.rows, crop, "Image height must be bigger than crop."); | 
|  | CAFFE_ENFORCE_GE( | 
|  | scaled_img.cols, crop, "Image width must be bigger than crop."); | 
|  |  | 
|  | // find the cropped region, and copy it to the destination matrix | 
|  | int width_offset, height_offset; | 
|  | if (is_test) { | 
|  | width_offset = (scaled_img.cols - crop) / 2; | 
|  | height_offset = (scaled_img.rows - crop) / 2; | 
|  | } else { | 
|  | width_offset = | 
|  | std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); | 
|  | height_offset = | 
|  | std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); | 
|  | } | 
|  |  | 
|  | if (mirror && (*mirror_this_image)(*randgen)) { | 
|  | // Copy mirrored image. | 
|  | for (int h = height_offset; h < height_offset + crop; ++h) { | 
|  | for (int w = width_offset + crop - 1; w >= width_offset; --w) { | 
|  | const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; | 
|  | for (int c = 0; c < channels; ++c) { | 
|  | *(cropped_data++) = cv_data[c]; | 
|  | } | 
|  | } | 
|  | } | 
|  | } else { | 
|  | // Copy normally. | 
|  | for (int h = height_offset; h < height_offset + crop; ++h) { | 
|  | for (int w = width_offset; w < width_offset + crop; ++w) { | 
|  | const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; | 
|  | for (int c = 0; c < channels; ++c) { | 
|  | *(cropped_data++) = cv_data[c]; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Parse datum, decode image, perform transform | 
|  | // Intended as entry point for binding to thread pool | 
|  | template <class Context> | 
|  | void ImageInputOp<Context>::DecodeAndTransform( | 
|  | const std::string& value, | 
|  | float* image_data, | 
|  | int item_id, | 
|  | const int channels, | 
|  | std::size_t thread_index) { | 
|  | CAFFE_ENFORCE((int)thread_index < num_decode_threads_); | 
|  |  | 
|  | std::bernoulli_distribution mirror_this_image(0.5f); | 
|  | std::mt19937* randgen = &(randgen_per_thread_[thread_index]); | 
|  |  | 
|  | cv::Mat img; | 
|  | // Decode the image | 
|  | PerImageArg info; | 
|  | CHECK( | 
|  | GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen)); | 
|  | // Factor out the image transformation | 
|  | TransformImage<Context>( | 
|  | img, | 
|  | channels, | 
|  | image_data, | 
|  | color_jitter_, | 
|  | img_saturation_, | 
|  | img_brightness_, | 
|  | img_contrast_, | 
|  | color_lighting_, | 
|  | color_lighting_std_, | 
|  | color_lighting_eigvecs_, | 
|  | color_lighting_eigvals_, | 
|  | crop_, | 
|  | mirror_, | 
|  | mean_, | 
|  | std_, | 
|  | randgen, | 
|  | &mirror_this_image, | 
|  | is_test_); | 
|  | } | 
|  |  | 
|  | template <class Context> | 
|  | void ImageInputOp<Context>::DecodeAndTransposeOnly( | 
|  | const std::string& value, | 
|  | uint8_t* image_data, | 
|  | int item_id, | 
|  | const int channels, | 
|  | std::size_t thread_index) { | 
|  | CAFFE_ENFORCE((int)thread_index < num_decode_threads_); | 
|  |  | 
|  | std::bernoulli_distribution mirror_this_image(0.5f); | 
|  | std::mt19937* randgen = &(randgen_per_thread_[thread_index]); | 
|  |  | 
|  | cv::Mat img; | 
|  | // Decode the image | 
|  | PerImageArg info; | 
|  | CHECK( | 
|  | GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen)); | 
|  |  | 
|  | // Factor out the image transformation | 
|  | CropTransposeImage<Context>( | 
|  | img, | 
|  | channels, | 
|  | image_data, | 
|  | crop_, | 
|  | mirror_, | 
|  | randgen, | 
|  | &mirror_this_image, | 
|  | is_test_); | 
|  | } | 
|  |  | 
|  | template <class Context> | 
|  | bool ImageInputOp<Context>::Prefetch() { | 
|  | if (!owned_reader_.get()) { | 
|  | // if we are not owning the reader, we will get the reader pointer from | 
|  | // input. Otherwise the constructor should have already set the reader | 
|  | // pointer. | 
|  | reader_ = &OperatorBase::Input<db::DBReader>(0); | 
|  | } | 
|  | const int channels = color_ ? 3 : 1; | 
|  | // Call mutable_data() once to allocate the underlying memory. | 
|  | if (gpu_transform_) { | 
|  | // we'll transfer up in int8, then convert later | 
|  | prefetched_image_.mutable_data<uint8_t>(); | 
|  | } else { | 
|  | prefetched_image_.mutable_data<float>(); | 
|  | } | 
|  |  | 
|  | prefetched_label_.mutable_data<int>(); | 
|  | // Prefetching handled with a thread pool of "decode_threads" threads. | 
|  |  | 
|  | for (int item_id = 0; item_id < batch_size_; ++item_id) { | 
|  | std::string key, value; | 
|  | cv::Mat img; | 
|  |  | 
|  | // read data | 
|  | reader_->Read(&key, &value); | 
|  |  | 
|  | // determine label type based on first item | 
|  | if (item_id == 0) { | 
|  | if (use_caffe_datum_) { | 
|  | prefetched_label_.mutable_data<int>(); | 
|  | } else { | 
|  | TensorProtos protos; | 
|  | CAFFE_ENFORCE(protos.ParseFromString(value)); | 
|  | TensorProto_DataType labeldt = protos.protos(1).data_type(); | 
|  | if (labeldt == TensorProto::INT32) { | 
|  | prefetched_label_.mutable_data<int>(); | 
|  | } else if (labeldt == TensorProto::FLOAT) { | 
|  | prefetched_label_.mutable_data<float>(); | 
|  | } else { | 
|  | LOG(FATAL) << "Unsupported label type."; | 
|  | } | 
|  |  | 
|  | for (int i = 0; i < additional_inputs_count_; ++i) { | 
|  | int index = additional_inputs_offset_ + i; | 
|  | TensorProto additional_output_proto = protos.protos(index); | 
|  | auto sizes = | 
|  | std::vector<int64_t>({batch_size_, additional_output_sizes_[i]}); | 
|  | if (additional_output_proto.data_type() == TensorProto::FLOAT) { | 
|  | prefetched_additional_outputs_[i] = | 
|  | caffe2::empty(sizes, at::dtype<float>().device(CPU)); | 
|  | } else if ( | 
|  | additional_output_proto.data_type() == TensorProto::INT32) { | 
|  | prefetched_additional_outputs_[i] = | 
|  | caffe2::empty(sizes, at::dtype<int>().device(CPU)); | 
|  | } else if ( | 
|  | additional_output_proto.data_type() == TensorProto::INT64) { | 
|  | prefetched_additional_outputs_[i] = | 
|  | caffe2::empty(sizes, at::dtype<int64_t>().device(CPU)); | 
|  | } else if ( | 
|  | additional_output_proto.data_type() == TensorProto::UINT8) { | 
|  | prefetched_additional_outputs_[i] = | 
|  | caffe2::empty(sizes, at::dtype<uint8_t>().device(CPU)); | 
|  | } else { | 
|  | LOG(FATAL) << "Unsupported output type."; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // launch into thread pool for processing | 
|  | // TODO: support color jitter and color lighting in gpu_transform | 
|  | if (gpu_transform_) { | 
|  | // output of decode will still be int8 | 
|  | uint8_t* image_data = prefetched_image_.mutable_data<uint8_t>() + | 
|  | crop_ * crop_ * channels * item_id; | 
|  | thread_pool_->runTaskWithID(std::bind( | 
|  | &ImageInputOp<Context>::DecodeAndTransposeOnly, | 
|  | this, | 
|  | std::string(value), | 
|  | image_data, | 
|  | item_id, | 
|  | channels, | 
|  | std::placeholders::_1)); | 
|  | } else { | 
|  | float* image_data = prefetched_image_.mutable_data<float>() + | 
|  | crop_ * crop_ * channels * item_id; | 
|  | thread_pool_->runTaskWithID(std::bind( | 
|  | &ImageInputOp<Context>::DecodeAndTransform, | 
|  | this, | 
|  | std::string(value), | 
|  | image_data, | 
|  | item_id, | 
|  | channels, | 
|  | std::placeholders::_1)); | 
|  | } | 
|  | } | 
|  | thread_pool_->waitWorkComplete(); | 
|  |  | 
|  | // we allow to get at most max_decode_error_ratio from | 
|  | // opencv imdecode until raising a runtime exception | 
|  | if ((float)num_decode_errors_in_batch_ / batch_size_ > | 
|  | max_decode_error_ratio_) { | 
|  | throw std::runtime_error( | 
|  | "max_decode_error_ratio exceeded " + | 
|  | c10::to_string(max_decode_error_ratio_)); | 
|  | } | 
|  |  | 
|  | // If the context is not CPUContext, we will need to do a copy in the | 
|  | // prefetch function as well. | 
|  | auto device = at::device(Context::GetDeviceType()); | 
|  | if (!std::is_same<Context, CPUContext>::value) { | 
|  | // do sync copies | 
|  | ReinitializeAndCopyFrom( | 
|  | &prefetched_image_on_device_, device, prefetched_image_); | 
|  | ReinitializeAndCopyFrom( | 
|  | &prefetched_label_on_device_, device, prefetched_label_); | 
|  |  | 
|  | for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) { | 
|  | ReinitializeAndCopyFrom( | 
|  | &prefetched_additional_outputs_on_device_[i], | 
|  | device, | 
|  | prefetched_additional_outputs_[i]); | 
|  | } | 
|  | } | 
|  |  | 
|  | num_decode_errors_in_batch_ = 0; | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | template <class Context> | 
|  | bool ImageInputOp<Context>::CopyPrefetched() { | 
|  | auto type = Device(Context::GetDeviceType()); | 
|  | auto options = at::device(type); | 
|  |  | 
|  | // Note(jiayq): The if statement below should be optimized away by the | 
|  | // compiler since std::is_same is a constexpr. | 
|  | if (std::is_same<Context, CPUContext>::value) { | 
|  | OperatorBase::OutputTensorCopyFrom( | 
|  | 0, options, prefetched_image_, /* async */ true); | 
|  | OperatorBase::OutputTensorCopyFrom( | 
|  | 1, options, prefetched_label_, /* async */ true); | 
|  |  | 
|  | for (int i = 2; i < OutputSize(); ++i) { | 
|  | OperatorBase::OutputTensorCopyFrom( | 
|  | i, options, prefetched_additional_outputs_[i - 2], /* async */ true); | 
|  | } | 
|  | } else { | 
|  | // TODO: support color jitter and color lighting in gpu_transform | 
|  | if (gpu_transform_) { | 
|  | if (!mean_std_copied_) { | 
|  | ReinitializeTensor( | 
|  | &mean_gpu_, | 
|  | {static_cast<int64_t>(mean_.size())}, | 
|  | at::dtype<float>().device(Context::GetDeviceType())); | 
|  | ReinitializeTensor( | 
|  | &std_gpu_, | 
|  | {static_cast<int64_t>(std_.size())}, | 
|  | at::dtype<float>().device(Context::GetDeviceType())); | 
|  |  | 
|  | context_.template CopyFromCPU<float>( | 
|  | mean_.size(), | 
|  | mean_.data(), | 
|  | mean_gpu_.template mutable_data<float>()); | 
|  | context_.template CopyFromCPU<float>( | 
|  | std_.size(), std_.data(), std_gpu_.template mutable_data<float>()); | 
|  | mean_std_copied_ = true; | 
|  | } | 
|  | const auto& X = prefetched_image_on_device_; | 
|  | // data comes in as NHWC | 
|  | const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2); | 
|  | // data goes out as NCHW | 
|  | auto dims = std::vector<int64_t>{N, C, H, W}; | 
|  | if (!ApplyTransformOnGPU(dims, type)) { | 
|  | return false; | 
|  | } | 
|  |  | 
|  | } else { | 
|  | OperatorBase::OutputTensorCopyFrom( | 
|  | 0, type, prefetched_image_on_device_, /* async */ true); | 
|  | } | 
|  | OperatorBase::OutputTensorCopyFrom( | 
|  | 1, type, prefetched_label_on_device_, /* async */ true); | 
|  |  | 
|  | for (int i = 2; i < OutputSize(); ++i) { | 
|  | OperatorBase::OutputTensorCopyFrom( | 
|  | i, | 
|  | type, | 
|  | prefetched_additional_outputs_on_device_[i - 2], | 
|  | /* async */ true); | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  | } // namespace caffe2 | 
|  |  | 
|  | #endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_ |