#include <math.h>
#include <cfloat>
// TODO(jamesreed): I would use <cmath> here but std::isnan
// and std::isinf are declared constexpr there and the nvidia
// compiler throws an error because of it
#include "caffe2/core/context_gpu.h"
#include "utility_ops.h"
namespace caffe2 {
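
// Scans the input for NaN/inf values. Each thread strides over the input and
// sets result[0] to true if it sees a NaN or inf; the result flag is expected
// to be initialized to false before the launch.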
__global__ void NanCheckKernel(int N, const float* X, bool* result) {
bool has_nan = false;
CUDA_1D_KERNEL_LOOP(i, N) {
// Note: there is no need to return early here. An early exit would only
// help in the failing case (once a NaN/inf has been seen), and there is no
// point optimizing the path that is about to abort anyway.
has_nan = has_nan || isnan(X[i]) || isinf(X[i]);
}
__syncthreads();
if (has_nan) {
result[0] = true;
}
}
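
// NanCheck on GPU: runs NanCheckKernel over input 0, copies the resulting
// flag back to the host, and, if a NaN/inf was found, prints every input
// tensor together with the offending indices of input 0 and returns false so
// the run aborts. Otherwise the op acts as an identity, copying input 0 to
// the output when it is not run in-place.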
template <>
bool NanCheckOp<CUDAContext>::RunOnDevice() {
auto& X = Input(0);
auto* Y = Output(0);
const size_t N = X.size();
const float* data_ptr = X.data<float>();
scratch_.Resize(1);
math::Set<bool, CUDAContext>(
1, false, scratch_.mutable_data<bool>(), &context_);
NanCheckKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
N, data_ptr, scratch_.mutable_data<bool>());
bool result = false;
{
std::lock_guard<std::mutex> lock(CUDAContext::mutex());
CUDA_ENFORCE(cudaMemcpyAsync(
&result,
scratch_.raw_data(),
sizeof(bool),
cudaMemcpyDefault,
context_.cuda_stream()));
}
// Note: we must synchronize here so we can inspect the result
context_.FinishDeviceComputation();
// Print out diagnostic info if we have a NaN or inf
if (result) {
std::cerr << "Tensor contained NaN or inf: " << this->def().input(0)
<< std::endl;
for (int j = 0; j < InputSize(); j++) {
TensorCPU cpu_X;
cpu_X.ResizeLike(Input(j));
// Hack to force the allocation to happen here, so it won't happen
// when we do CopyFrom. We need the mutex for the copy itself because
// copies between host and GPU seem to be able to deadlock with NCCL.
cpu_X.mutable_data<float>();
{
std::lock_guard<std::mutex> lock(CUDAContext::mutex());
cpu_X.CopyFrom(Input(j), &context_);
}
context_.FinishDeviceComputation();
std::cerr << "Input tensor: " << j << ": [" << def().input(j) << "]"
<< std::endl;
tensorPrinter_.Print<float>(cpu_X);
if (j == 0) {
std::cerr << "NaN idxs:" << std::endl;
auto* cpu_X_data = cpu_X.data<float>();
for (size_t i = 0; i < cpu_X.size(); ++i) {
if (isnan(cpu_X_data[i]) || isinf(cpu_X_data[i])) {
std::cerr << i << " ";
}
}
}
std::cerr << std::endl;
}
return false;
}
// This op should act as an identity operation if we don't find any NaNs/infs.
// Copy over the data if we are not doing this in-place.
if (&X != Y) {
Y->CopyFrom(X, &context_);
}
return true;
}
REGISTER_CUDA_OPERATOR(NanCheck, NanCheckOp<CUDAContext>);
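
// Elementwise maximum of two arrays: maxout[i] = max(X[i], Y[i]).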
__global__ void
ElwiseMaxKernel(const float* X, const float* Y, float* maxout, const int N) {
CUDA_1D_KERNEL_LOOP(i, N) {
maxout[i] = max(X[i], Y[i]);
}
}
template <>
bool MaxOp<float, CUDAContext>::Compute() {
float* output_data = Output(0)->mutable_data<float>();
const int N = Input(0).size();
// Run pairwise maxes. The loop starts at 1 because Output(0) is expected to
// already hold a copy of Input(0) (set up in MaxOp<T, Context>::RunOnDevice),
// so each iteration folds Input(i) into the running maximum in the output.
// Note that the (i == 0) branch below is therefore never taken.
for (int i = 1; i < InputSize(); ++i) {
ElwiseMaxKernel<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
(i == 0 ? Input(0).data<float>() : Output(0)->data<float>()),
Input(i).data<float>(),
output_data,
N);
}
return true;
}
REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>);
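
// Gather kernel: block i looks up indices[i] and its threads cooperatively
// copy the block_size contiguous float elements of that row of X into row i
// of Y.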
template <typename T_INDEX>
__global__ void GatherKernel(
const float* X,
float* Y,
const T_INDEX* indices,
const int N,
const int block_size) {
for (int i = blockIdx.x; i < N; i += gridDim.x) {
T_INDEX idx = indices[i];
const float* src_offset = X + idx * block_size;
float* dst_offset = Y + i * block_size;
for (int j = threadIdx.x; j < block_size; j += blockDim.x) {
dst_offset[j] = src_offset[j];
}
}
}
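
// Dispatch on the element type of the INDICES tensor (int32_t or int64_t).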
template <>
bool GatherOp<CUDAContext>::RunOnDevice() {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, OperatorBase::Input<TensorCUDA>(INDICES));
}
template <>
template <typename Index>
bool GatherOp<CUDAContext>::DoRunWithType() {
auto& data = Input(DATA);
auto& indices = Input(INDICES);
auto* output = Output(0);
CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
auto shape = indices.dims();
shape.insert(shape.end(), data.dims().begin() + 1, data.dims().end());
output->Resize(shape);
int block_size = data.size() / data.dim(0);
auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize();
CAFFE_ENFORCE(
block_bytesize == data.nbytes() / data.dim(0),
"block_bytesize should be consistent with data dim");
int N = indices.size();
// Note: GatherKernel copies rows as arrays of 32-bit floats, so this path
// assumes the DATA tensor has a 4-byte element type.
auto src_base = static_cast<const float*>(data.raw_data());
const Index* idxs = indices.template data<Index>();
auto out = static_cast<float*>(output->raw_mutable_data(data.meta()));
// Return early when there is nothing to gather; launching the kernel below
// with zero blocks would be an invalid launch configuration.
if (N == 0) {
return true;
}
GatherKernel<<<
std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
src_base, out, idxs, N, block_size
);
return true;
}
namespace {
REGISTER_CUDA_OPERATOR(Gather, GatherOp<CUDAContext>);
}
} // namespace caffe2