/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cfloat>
#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sigmoid_focal_loss_op.h"
namespace caffe2 {
namespace {
__global__ void SigmoidFocalLossKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, const float* weight_pos,
    const float gamma, const float alpha,
    const int num_classes, float* losses) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    int x = i % W;
    int y = (i / W) % H;
    int c = (i / (W * H)) % D; // channel; D is the channel dim of the NxDxHxW input
    int n = i / (W * H * D);   // n in NxDxHxW
    int A = D / num_classes;   // number of anchors A
    int a = c / num_classes;   // current anchor out of A anchors, D = A * num_classes
    int d = c % num_classes;   // current class
    int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; // target

    // Check whether d is the true class. The targets are in the range
    // 1..num_classes (e.g. 1-80 for COCO) while d is in 0..num_classes-1,
    // so compare t against (d + 1). A target of -1 marks an ignored location.
    float c1 = (t == (d + 1));
    float c2 = (t != -1 && t != (d + 1));
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float zn = (1.0 - alpha) / Np;
    float zp = alpha / Np;

    // p = 1 / (1 + exp(-x))
    float p = 1. / (1. + expf(-logits[i]));

    // term1 = (1 - p)^gamma * log(p); p is clamped to FLT_MIN so log stays finite
    float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN));

    // term2 = p^gamma * log(1 - p), where log(1 - p) is computed with the
    // numerically stable identity
    //   log(1 - sigmoid(x)) = -x * (x >= 0) - log(1 + exp(x - 2 * x * (x >= 0)))
    // to avoid overflow in expf for large |x|.
    float term2 =
        powf(p, gamma) *
        (-1. * logits[i] * (logits[i] >= 0) -
         logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
    losses[i] = 0.0;
    losses[i] += -c1 * term1 * zp;
    losses[i] += -c2 * term2 * zn;
  }
}
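
// Worked example (illustrative, not part of the op): with gamma = 2,
// alpha = 0.25, and Np = 1, an easy positive with p = 0.9 contributes
//   -(1 - 0.9)^2 * log(0.9) * 0.25 ~ 2.6e-4,
// while a hard positive with p = 0.5 contributes
//   -(1 - 0.5)^2 * log(0.5) * 0.25 ~ 4.3e-2,
// so the focal term down-weights easy examples by roughly two orders of
// magnitude.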
// Gradient of the focal loss above w.r.t. the logits. With p = sigmoid(x)
// and dp/dx = p * (1 - p):
//   d/dx [(1 - p)^gamma * log(p)]  = (1 - p)^gamma * (1 - p - gamma * p * log(p))
//   d/dx [p^gamma * log(1 - p)]    = p^gamma * (gamma * (1 - p) * log(1 - p) - p)
__global__ void SigmoidFocalLossGradientKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, float* dX_data, const float* weight_pos,
    const float gamma, const float alpha, const int num_classes,
    const float* avg_loss) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    float a_loss = avg_loss[0];
    int x = i % W;
    int y = (i / W) % H;
    int c = (i / (W * H)) % D;
    int n = i / (W * H * D);
    int A = D / num_classes; // number of anchors
    int a = c / num_classes; // current anchor
    int d = c % num_classes; // current class
    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float zn = (1.0 - alpha) / Np;
    float zp = alpha / Np;
    int t = targets[n * (H * W * A) + a * (H * W) + y * W + x];
    float c1 = (t == (d + 1));
    float c2 = (t != -1 && t != (d + 1));
    float p = 1. / (1. + expf(-logits[i]));

    // (1 - p)^gamma * (1 - p - gamma * p * log(p))
    float term1 =
        powf((1. - p), gamma) *
        (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN))));

    // p^gamma * (gamma * (1 - p) * log(1 - p) - p), with log(1 - p) computed
    // via the same numerically stable identity as in the forward kernel
    float term2 =
        powf(p, gamma) *
        ((-1. * logits[i] * (logits[i] >= 0) -
          logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
         (1. - p) * gamma - p);
    dX_data[i] = 0.0;
    dX_data[i] += -c1 * zp * term1;
    dX_data[i] += -c2 * zn * term2;
    // chain rule: multiply by the incoming gradient of the scalar loss
    dX_data[i] = dX_data[i] * a_loss;
  }
}
} // namespace
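
// Shape example (illustrative) for the two operators below: for COCO with
// num_classes = 80 and A = 9 anchors per location, the logits X are
// N x 720 x H x W, the targets T are N x 9 x H x W, wp holds a single float
// (the positive-example count), and the loss output is a scalar.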
template <>
bool SigmoidFocalLossOp<float, CUDAContext>::RunOnDevice() {
  // Input logits of shape N x D x H x W, where D = A * num_classes
  // (for example, N x (A * 80) x H x W for COCO)
  auto& X = Input(0);
  // Targets of shape N x A x H x W
  auto& T = Input(1);
  // Number of positive examples: scalar
  auto& wp = Input(2);
  // Output: scalar average sigmoid focal loss, as defined in the RetinaNet paper

  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  losses_.ResizeLike(X);
  float* avg_loss_data = avg_loss->mutable_data<float>();

  SigmoidFocalLossKernel<<<CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
      N, D, H, W, X.data<float>(), T.data<int>(),
      wp.data<float>(), gamma_, alpha_, num_classes_,
      losses_.mutable_data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Reduce the per-element losses to a single scalar and apply the loss scale.
  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);
  return true;
}
template <>
bool SigmoidFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);  // logits: N x D x H x W
  auto& T = Input(1);  // targets: N x A x H x W
  auto& wp = Input(2); // number of positive examples: scalar
  // gradient of the scalar avg loss output (last input)
  auto& d_avg_loss = Input(InputSize() - 1);

  // get input shape
  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  SigmoidFocalLossGradientKernel<<<CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
      N, D, H, W, X.data<float>(), T.data<int>(), dX->mutable_data<float>(),
      wp.data<float>(), gamma_, alpha_, num_classes_,
      d_avg_loss.data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  math::Scale<float, float, CUDAContext>(
      dX->size(),
      scale_,
      dX->data<float>(),
      dX->mutable_data<float>(),
      &context_);
  return true;
}
REGISTER_CUDA_OPERATOR(SigmoidFocalLoss,
                       SigmoidFocalLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient,
                       SigmoidFocalLossGradientOp<float, CUDAContext>);
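
// Minimal usage sketch (hypothetical blob names, Caffe2 NetDef text format;
// the arg names mirror the gamma_/alpha_/num_classes_/scale_ members read by
// the op):
//   op {
//     type: "SigmoidFocalLoss"
//     input: "retnet_cls_pred" input: "retnet_cls_labels" input: "fg_num"
//     output: "fl_loss"
//     arg { name: "gamma" f: 2.0 } arg { name: "alpha" f: 0.25 }
//     arg { name: "num_classes" i: 80 } arg { name: "scale" f: 1.0 }
//   }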
} // namespace caffe2