| #include <cuda.h> |
| #include <thrust/device_vector.h> |
| #include <thrust/transform_reduce.h> |
| #include <thrust/system/cuda/execution_policy.h> |
| |
| #include "caffe2/operators/summarize_op.h" |
| #include "caffe2/core/context_gpu.h" |
| |
| namespace caffe2 { |
| |
| namespace { |
| |
| // structure used to accumulate the moments and other statistical properties |
| // encountered so far. |
| template <typename T> |
| struct SummaryStatsData { |
| T n; |
| T min; |
| T max; |
| T mean; |
| T M2; |
| |
| // initialize to the identity element |
| void initialize() { |
| n = mean = M2 = 0; |
| min = std::numeric_limits<T>::max(); |
| max = std::numeric_limits<T>::min(); |
| } |
| |
| T variance() { return (n == 1 ? 0 : M2 / (n - 1)); } |
| }; |
| |
| // stats_unary_op is a functor that takes in a value x and |
| // returns a variace_data whose mean value is initialized to x. |
| template <typename T> |
| struct summary_stats_unary_op { |
| __host__ __device__ SummaryStatsData<T> operator()(const T& x) const { |
| SummaryStatsData<T> result; |
| result.n = 1; |
| result.min = x; |
| result.max = x; |
| result.mean = x; |
| result.M2 = 0; |
| return result; |
| } |
| }; |
| |
| // summary_stats_binary_op is a functor that accepts two SummaryStatsData |
| // structs and returns a new SummaryStatsData which are an |
| // approximation to the summary_stats for |
| // all values that have been aggregated so far |
| template <typename T> |
| struct summary_stats_binary_op |
| : public thrust::binary_function<const SummaryStatsData<T>&, |
| const SummaryStatsData<T>&, |
| SummaryStatsData<T> > { |
| __host__ __device__ SummaryStatsData<T> operator()( |
| const SummaryStatsData<T>& x, const SummaryStatsData <T>& y) const { |
| SummaryStatsData<T> result; |
| T n = x.n + y.n; |
| T delta = y.mean - x.mean; |
| T delta2 = delta * delta; |
| result.n = n; |
| result.min = thrust::min(x.min, y.min); |
| result.max = thrust::max(x.max, y.max); |
| result.mean = x.mean + delta * y.n / n; |
| result.M2 = x.M2 + y.M2; |
| result.M2 += delta2 * x.n * y.n / n; |
| return result; |
| } |
| }; |
| |
| } // namespace |
| |
| template<> |
| bool SummarizeOp<float, CUDAContext>::RunOnDevice() { |
| auto& X = Input(0); |
| const int N = X.numel(); |
| TORCH_DCHECK_GT(N, 0); |
| |
| // TODO(Yangqing): Any better way to avoid having to const cast? |
| thrust::device_ptr<float> Xdata(const_cast<float*>(X.data<float>())); |
| summary_stats_unary_op<float> unary_op; |
| summary_stats_binary_op<float> binary_op; |
| SummaryStatsData<float> init; |
| init.initialize(); |
| // compute summary statistics |
| SummaryStatsData<float> result = thrust::transform_reduce( |
| #if THRUST_VERSION >= 100800 |
| thrust::cuda::par.on(context_.cuda_stream()), |
| #endif // THRUST_VERSION >= 100800 |
| Xdata, Xdata + N, unary_op, init, binary_op); |
| float standard_deviation = std::sqrt(result.variance()); |
| if (to_file_) { |
| (*log_file_) << result.min << " " << result.max << " " << result.mean << " " |
| << standard_deviation << std::endl; |
| } |
| if (OutputSize()) { |
| auto* Y = Output(0, {4}, at::dtype<float>()); |
| float output_buffer[NUM_STATS] = {result.min, result.max, result.mean, |
| standard_deviation}; |
| context_.CopyFromCPU<float>( |
| NUM_STATS, output_buffer, Y->template mutable_data<float>()); |
| } |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(Summarize, SummarizeOp<float, CUDAContext>); |
| } // namespace caffe2 |