Make THCUNN build on Windows (MSVC) and replace long with ptrdiff_t for element counts
diff --git a/AbsCriterion.cu b/AbsCriterion.cu
index fd19cae..fcb1239 100644
--- a/AbsCriterion.cu
+++ b/AbsCriterion.cu
@@ -20,7 +20,7 @@
 {
   THCUNN_assertSameGPU(state, 2, input, target);
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   target = THCudaTensor_newContiguous(state, target);
@@ -56,7 +56,7 @@
 {
   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   float norm = (sizeAverage ? 1./size : 1.);
 
   input = THCudaTensor_newContiguous(state, input);
diff --git a/BCECriterion.cu b/BCECriterion.cu
index c8edcfe..098c50c 100644
--- a/BCECriterion.cu
+++ b/BCECriterion.cu
@@ -7,7 +7,16 @@
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
+#ifndef _MSC_VER
+// MSVC 19 + CUDA 8.0 (x64 Windows) rejects the namespace-scope constant below with
+// 'identifier "eps" is undefined in device code', so MSVC builds use a macro instead.
 const float eps = 1e-12f;
+#else
+#ifdef eps
+#error eps has already been defined!
+#endif
+#define eps 1e-12f
+#endif
 
 struct bce_functor
 {
@@ -38,7 +47,7 @@
 {
   THCUNN_assertSameGPU(state, 3, input, target, weights);
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   target = THCudaTensor_newContiguous(state, target);
@@ -114,11 +123,15 @@
   }
 };
 
+#ifdef _MSC_VER
+#undef eps
+#endif
+
 void THNN_CudaBCECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights)
 {
   THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights);
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   float norm = (sizeAverage ? 1./size : 1.);
 
   input = THCudaTensor_newContiguous(state, input);
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6439e1..2cc583a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,11 @@
   endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
 endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
 
+IF(MSVC)
+  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
+  ADD_DEFINITIONS(-DTH_EXPORTS)
+ENDIF()
+
 IF(NOT THCUNN_INSTALL_LIB_SUBDIR)
   SET(THCUNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THCUNN install library directory")
 ENDIF()
@@ -33,6 +38,10 @@
 
 CUDA_ADD_LIBRARY(THCUNN MODULE ${src-cuda})
 
+SET_TARGET_PROPERTIES(THCUNN PROPERTIES
+  PREFIX "lib"
+  IMPORT_PREFIX "lib")
+
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 TARGET_LINK_LIBRARIES(THCUNN THC TH ${CUDA_cusparse_LIBRARY})
 
diff --git a/DistKLDivCriterion.cu b/DistKLDivCriterion.cu
index 2984672..ec42d45 100644
--- a/DistKLDivCriterion.cu
+++ b/DistKLDivCriterion.cu
@@ -24,7 +24,7 @@
 
   float sum;
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   target = THCudaTensor_newContiguous(state, target);
@@ -63,7 +63,7 @@
   THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
              "input and target need to have the same number of elements");
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   float norm = (sizeAverage ? 1./size : 1.);
 
   input = THCudaTensor_newContiguous(state, input);
diff --git a/L1Cost.cu b/L1Cost.cu
index 7dd6bb2..89bb853 100644
--- a/L1Cost.cu
+++ b/L1Cost.cu
@@ -17,7 +17,7 @@
 {
   THCUNN_assertSameGPU(state, 1, input);
   float sum;
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   input = THCudaTensor_newContiguous(state, input);
   thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
   sum = thrust::reduce(input_data, input_data+size, (float) 0, l1cost_functor());
@@ -43,7 +43,7 @@
 void THNN_CudaL1Cost_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
 {
   THCUNN_assertSameGPU(state, 2, input, gradInput);
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   THCudaTensor_resizeAs(state, gradInput, input);
diff --git a/LookupTable.cu b/LookupTable.cu
index 2b2040e..9fa1dec 100644
--- a/LookupTable.cu
+++ b/LookupTable.cu
@@ -50,7 +50,7 @@
 }
 
 __global__ void cunn_LookupTable_accGradParametersKernelByFeature(
-  long *input, float *gradOutput, float *gradWeight, float scale, long numel,
+  long *input, float *gradOutput, float *gradWeight, float scale, ptrdiff_t numel,
   long stride, int paddingValue) {
 
   const int featureDim = blockIdx.x * 4 + threadIdx.x / 32;
@@ -72,7 +72,7 @@
   // updates are serialized in their order of execution by using the
   // warp-wide collision detector `warpHasCollision`.
   const int laneId = threadIdx.x % 32;
-  for (int i = laneId; i < numel; i += WARP_SIZE) {
+  for (ptrdiff_t i = laneId; i < numel; i += WARP_SIZE) {
     const int weightIndex = (int) (input[i] - TH_INDEX_BASE);
     if (weightIndex == paddingValue - TH_INDEX_BASE) {
       continue;
@@ -97,7 +97,7 @@
 
 __global__ void cunn_LookupTable_accGradParametersKernel(
   long *input, long *indices, float *gradOutput, float *gradWeight,
-  long *count, float defaultScale, long numel, long stride, int paddingValue) {
+  long *count, float defaultScale, ptrdiff_t numel, long stride, int paddingValue) {
 
   int idx = blockIdx.x * 4 + threadIdx.y;
 
@@ -183,7 +183,7 @@
   if (nDim != 1 && nDim != 2)
     THError("input must be a vector or matrix");
 
-  long numel = THIndexTensor_(nElement)(state, input);
+  ptrdiff_t numel = THIndexTensor_(nElement)(state, input);
   long stride = gradWeight->stride[0];
 
   cudaStream_t stream = THCState_getCurrentStream(state);
@@ -314,7 +314,7 @@
   if (normType <= 0)
     THError("non-positive-norm not supported");
 
-  long numel = THIndexTensor_(nElement)(state, idx);
+  ptrdiff_t numel = THIndexTensor_(nElement)(state, idx);
   long stride = weight->stride[0];
 
   // get the unique indices
@@ -326,7 +326,7 @@
   pow_v<float> unary_pow(normType);
   thrust::plus<float> binary_plus;
   // numel << stride, since idx usually contains sparse row indices
-  for (long i = 0; i < numel; i++)
+  for (ptrdiff_t i = 0; i < numel; i++)
   {
     long k = idx_ptr[i] - TH_INDEX_BASE;
     thrust::device_ptr<float> row_ptr = weight_ptr + k * stride;
diff --git a/MSECriterion.cu b/MSECriterion.cu
index 982e19a..98ce4cd 100644
--- a/MSECriterion.cu
+++ b/MSECriterion.cu
@@ -28,7 +28,7 @@
     "input and target need to have the same number of elements"
   );
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   target = THCudaTensor_newContiguous(state, target);
@@ -72,7 +72,7 @@
     "input and target need to have the same number of elements"
   );
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   float norm = sizeAverage ? 2.f/size : 2.f;
 
   input = THCudaTensor_newContiguous(state, input);
diff --git a/MarginCriterion.cu b/MarginCriterion.cu
index b49e1dc..038d03f 100644
--- a/MarginCriterion.cu
+++ b/MarginCriterion.cu
@@ -26,7 +26,7 @@
 {
   THCUNN_assertSameGPU(state, 2, input, target);
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   target = THCudaTensor_newContiguous(state, target);
@@ -63,7 +63,7 @@
 {
   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   float norm = sizeAverage ? 1.f/size : 1;
 
   input = THCudaTensor_newContiguous(state, input);
diff --git a/RReLU.cu b/RReLU.cu
index 245598b..6f5cb07 100644
--- a/RReLU.cu
+++ b/RReLU.cu
@@ -6,9 +6,9 @@
 // copied from cutorch/lib/THC/THCTensorRandom.cu
 #define MAX_NUM_BLOCKS 64
 #define BLOCK_SIZE 256
-#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (long) BLOCK_SIZE), MAX_NUM_BLOCKS)
+#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)
 
-__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
+__global__ void rreluUpdateOutputTrain(ptrdiff_t n, curandStateMtgp32 *state,
   float *input, float* noise, float *output, double a, double b)
 {
   CUDA_KERNEL_LOOP(i, n)
@@ -73,7 +73,7 @@
     THCudaTensor_resizeAs(state, noise, input);
     float *input_data = THCudaTensor_data(state, input);
     float *noise_data = THCudaTensor_data(state, noise);
-    long n = THCudaTensor_nElement(state, input);
+    ptrdiff_t n = THCudaTensor_nElement(state, input);
     if (inplace)
     {
       rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
diff --git a/SmoothL1Criterion.cu b/SmoothL1Criterion.cu
index 2b528ef..7ff96a3 100644
--- a/SmoothL1Criterion.cu
+++ b/SmoothL1Criterion.cu
@@ -29,7 +29,7 @@
     "input and target need to have the same number of elements"
   );
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   target = THCudaTensor_newContiguous(state, target);
@@ -81,7 +81,7 @@
     "input and target need to have the same number of elements"
   );
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   float norm = sizeAverage ? 1./size : 1.;
 
   input = THCudaTensor_newContiguous(state, input);
diff --git a/SoftMarginCriterion.cu b/SoftMarginCriterion.cu
index 2a15f4a..cbf8f8b 100644
--- a/SoftMarginCriterion.cu
+++ b/SoftMarginCriterion.cu
@@ -26,7 +26,7 @@
   THCUNN_assertSameGPU(state, 2, input, target);
   float sum;
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
 
   input = THCudaTensor_newContiguous(state, input);
   target = THCudaTensor_newContiguous(state, target);
@@ -68,7 +68,7 @@
 {
   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
 
-  long size = THCudaTensor_nElement(state, input);
+  ptrdiff_t size = THCudaTensor_nElement(state, input);
   float norm = (sizeAverage ? 1./size : 1.);
 
   input = THCudaTensor_newContiguous(state, input);
diff --git a/SpatialClassNLLCriterion.cu b/SpatialClassNLLCriterion.cu
index 0469520..f52707f 100644
--- a/SpatialClassNLLCriterion.cu
+++ b/SpatialClassNLLCriterion.cu
@@ -116,7 +116,7 @@
   float *total_weight_data = THCudaTensor_data(state, total_weight);
 
   long batch_size = THCudaLongTensor_size(state, target, 0);
-  long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
+  ptrdiff_t map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
   int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
   blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
   int total_blocks = blocks_per_sample * batch_size;
@@ -179,7 +179,7 @@
   float *total_weight_data = THCudaTensor_data(state, total_weight);
 
   long batch_size = THCudaLongTensor_size(state, target, 0);
-  long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
+  ptrdiff_t map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
   int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
   blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
   int total_blocks = blocks_per_sample * batch_size;
diff --git a/cmake/select_compute_arch.cmake b/cmake/select_compute_arch.cmake
index 5376e5b..4b27441 100644
--- a/cmake/select_compute_arch.cmake
+++ b/cmake/select_compute_arch.cmake
@@ -69,6 +69,10 @@
                     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
 
     if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
       string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
       set(CUDA_GPU_DETECT_OUTPUT ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_gpus tool" FORCE)
     endif()