Change to work on Windows and replace `long` with `ptrdiff_t`
diff --git a/AbsCriterion.cu b/AbsCriterion.cu
index fd19cae..fcb1239 100644
--- a/AbsCriterion.cu
+++ b/AbsCriterion.cu
@@ -20,7 +20,7 @@
{
THCUNN_assertSameGPU(state, 2, input, target);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
@@ -56,7 +56,7 @@
{
THCUNN_assertSameGPU(state, 3, input, target, gradInput);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
float norm = (sizeAverage ? 1./size : 1.);
input = THCudaTensor_newContiguous(state, input);
diff --git a/BCECriterion.cu b/BCECriterion.cu
index c8edcfe..098c50c 100644
--- a/BCECriterion.cu
+++ b/BCECriterion.cu
@@ -7,7 +7,16 @@
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>
+#ifndef _MSC_VER
+// MSVC 19 with CUDA 8.0 on x64 Windows fails with 'identifier "eps" is
+// undefined in device code' when eps is declared as a const below.
const float eps = 1e-12f;
+#else
+#ifdef eps
+#error eps has already been defined!
+#endif
+#define eps 1e-12f
+#endif
struct bce_functor
{
@@ -38,7 +47,7 @@
{
THCUNN_assertSameGPU(state, 3, input, target, weights);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
@@ -114,11 +123,15 @@
}
};
+#ifdef _MSC_VER
+#undef eps
+#endif
+
void THNN_CudaBCECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights)
{
THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
float norm = (sizeAverage ? 1./size : 1.);
input = THCudaTensor_newContiguous(state, input);
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6439e1..2cc583a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,11 @@
endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+IF(MSVC)
+ LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
+ ADD_DEFINITIONS(-DTH_EXPORTS)
+ENDIF()
+
IF(NOT THCUNN_INSTALL_LIB_SUBDIR)
SET(THCUNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THCUNN install library directory")
ENDIF()
@@ -33,6 +38,10 @@
CUDA_ADD_LIBRARY(THCUNN MODULE ${src-cuda})
+SET_TARGET_PROPERTIES(THCUNN PROPERTIES
+ PREFIX "lib"
+ IMPORT_PREFIX "lib")
+
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
TARGET_LINK_LIBRARIES(THCUNN THC TH ${CUDA_cusparse_LIBRARY})
diff --git a/DistKLDivCriterion.cu b/DistKLDivCriterion.cu
index 2984672..ec42d45 100644
--- a/DistKLDivCriterion.cu
+++ b/DistKLDivCriterion.cu
@@ -24,7 +24,7 @@
float sum;
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
@@ -63,7 +63,7 @@
THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
"input and target need to have the same number of elements");
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
float norm = (sizeAverage ? 1./size : 1.);
input = THCudaTensor_newContiguous(state, input);
diff --git a/L1Cost.cu b/L1Cost.cu
index 7dd6bb2..89bb853 100644
--- a/L1Cost.cu
+++ b/L1Cost.cu
@@ -17,7 +17,7 @@
{
THCUNN_assertSameGPU(state, 1, input);
float sum;
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
sum = thrust::reduce(input_data, input_data+size, (float) 0, l1cost_functor());
@@ -43,7 +43,7 @@
void THNN_CudaL1Cost_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
{
THCUNN_assertSameGPU(state, 2, input, gradInput);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
THCudaTensor_resizeAs(state, gradInput, input);
diff --git a/LookupTable.cu b/LookupTable.cu
index 2b2040e..9fa1dec 100644
--- a/LookupTable.cu
+++ b/LookupTable.cu
@@ -50,7 +50,7 @@
}
__global__ void cunn_LookupTable_accGradParametersKernelByFeature(
- long *input, float *gradOutput, float *gradWeight, float scale, long numel,
+ long *input, float *gradOutput, float *gradWeight, float scale, ptrdiff_t numel,
long stride, int paddingValue) {
const int featureDim = blockIdx.x * 4 + threadIdx.x / 32;
@@ -72,7 +72,7 @@
// updates are serialized in their order of execution by using the
// warp-wide collision detector `warpHasCollision`.
const int laneId = threadIdx.x % 32;
- for (int i = laneId; i < numel; i += WARP_SIZE) {
+ for (ptrdiff_t i = laneId; i < numel; i += WARP_SIZE) {
const int weightIndex = (int) (input[i] - TH_INDEX_BASE);
if (weightIndex == paddingValue - TH_INDEX_BASE) {
continue;
@@ -97,7 +97,7 @@
__global__ void cunn_LookupTable_accGradParametersKernel(
long *input, long *indices, float *gradOutput, float *gradWeight,
- long *count, float defaultScale, long numel, long stride, int paddingValue) {
+ long *count, float defaultScale, ptrdiff_t numel, long stride, int paddingValue) {
int idx = blockIdx.x * 4 + threadIdx.y;
@@ -183,7 +183,7 @@
if (nDim != 1 && nDim != 2)
THError("input must be a vector or matrix");
- long numel = THIndexTensor_(nElement)(state, input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(state, input);
long stride = gradWeight->stride[0];
cudaStream_t stream = THCState_getCurrentStream(state);
@@ -314,7 +314,7 @@
if (normType <= 0)
THError("non-positive-norm not supported");
- long numel = THIndexTensor_(nElement)(state, idx);
+ ptrdiff_t numel = THIndexTensor_(nElement)(state, idx);
long stride = weight->stride[0];
// get the unique indices
@@ -326,7 +326,7 @@
pow_v<float> unary_pow(normType);
thrust::plus<float> binary_plus;
// numel << stride, since idx usually contains sparse row indices
- for (long i = 0; i < numel; i++)
+ for (ptrdiff_t i = 0; i < numel; i++)
{
long k = idx_ptr[i] - TH_INDEX_BASE;
thrust::device_ptr<float> row_ptr = weight_ptr + k * stride;
diff --git a/MSECriterion.cu b/MSECriterion.cu
index 982e19a..98ce4cd 100644
--- a/MSECriterion.cu
+++ b/MSECriterion.cu
@@ -28,7 +28,7 @@
"input and target need to have the same number of elements"
);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
@@ -72,7 +72,7 @@
"input and target need to have the same number of elements"
);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
float norm = sizeAverage ? 2.f/size : 2.f;
input = THCudaTensor_newContiguous(state, input);
diff --git a/MarginCriterion.cu b/MarginCriterion.cu
index b49e1dc..038d03f 100644
--- a/MarginCriterion.cu
+++ b/MarginCriterion.cu
@@ -26,7 +26,7 @@
{
THCUNN_assertSameGPU(state, 2, input, target);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
@@ -63,7 +63,7 @@
{
THCUNN_assertSameGPU(state, 3, input, target, gradInput);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
float norm = sizeAverage ? 1.f/size : 1;
input = THCudaTensor_newContiguous(state, input);
diff --git a/RReLU.cu b/RReLU.cu
index 245598b..6f5cb07 100644
--- a/RReLU.cu
+++ b/RReLU.cu
@@ -6,9 +6,9 @@
// copied from cutorch/lib/THC/THCTensorRandom.cu
#define MAX_NUM_BLOCKS 64
#define BLOCK_SIZE 256
-#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (long) BLOCK_SIZE), MAX_NUM_BLOCKS)
+#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)
-__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
+__global__ void rreluUpdateOutputTrain(ptrdiff_t n, curandStateMtgp32 *state,
float *input, float* noise, float *output, double a, double b)
{
CUDA_KERNEL_LOOP(i, n)
@@ -73,7 +73,7 @@
THCudaTensor_resizeAs(state, noise, input);
float *input_data = THCudaTensor_data(state, input);
float *noise_data = THCudaTensor_data(state, noise);
- long n = THCudaTensor_nElement(state, input);
+ ptrdiff_t n = THCudaTensor_nElement(state, input);
if (inplace)
{
rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
diff --git a/SmoothL1Criterion.cu b/SmoothL1Criterion.cu
index 2b528ef..7ff96a3 100644
--- a/SmoothL1Criterion.cu
+++ b/SmoothL1Criterion.cu
@@ -29,7 +29,7 @@
"input and target need to have the same number of elements"
);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
@@ -81,7 +81,7 @@
"input and target need to have the same number of elements"
);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
float norm = sizeAverage ? 1./size : 1.;
input = THCudaTensor_newContiguous(state, input);
diff --git a/SoftMarginCriterion.cu b/SoftMarginCriterion.cu
index 2a15f4a..cbf8f8b 100644
--- a/SoftMarginCriterion.cu
+++ b/SoftMarginCriterion.cu
@@ -26,7 +26,7 @@
THCUNN_assertSameGPU(state, 2, input, target);
float sum;
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
input = THCudaTensor_newContiguous(state, input);
target = THCudaTensor_newContiguous(state, target);
@@ -68,7 +68,7 @@
{
THCUNN_assertSameGPU(state, 3, input, target, gradInput);
- long size = THCudaTensor_nElement(state, input);
+ ptrdiff_t size = THCudaTensor_nElement(state, input);
float norm = (sizeAverage ? 1./size : 1.);
input = THCudaTensor_newContiguous(state, input);
diff --git a/SpatialClassNLLCriterion.cu b/SpatialClassNLLCriterion.cu
index 0469520..f52707f 100644
--- a/SpatialClassNLLCriterion.cu
+++ b/SpatialClassNLLCriterion.cu
@@ -116,7 +116,7 @@
float *total_weight_data = THCudaTensor_data(state, total_weight);
long batch_size = THCudaLongTensor_size(state, target, 0);
- long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
+ ptrdiff_t map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
int total_blocks = blocks_per_sample * batch_size;
@@ -179,7 +179,7 @@
float *total_weight_data = THCudaTensor_data(state, total_weight);
long batch_size = THCudaLongTensor_size(state, target, 0);
- long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
+ ptrdiff_t map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
int total_blocks = blocks_per_sample * batch_size;
diff --git a/cmake/select_compute_arch.cmake b/cmake/select_compute_arch.cmake
index 5376e5b..4b27441 100644
--- a/cmake/select_compute_arch.cmake
+++ b/cmake/select_compute_arch.cmake
@@ -69,6 +69,10 @@
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
+ # only keep the last line of nvcc_out
+ STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+ STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+ list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE)
endif()