Simplify ATen Build (#3496)

* THS build change

* merge THCS into ATen build

* THCUNN build change over

* update THNN build

* move THC build to ATen, as well as some of the accumulated top level config from other TH* libraries

* TH library build merged into ATen, and warning fixes.

* fix magma support checking

* check cuda early

* fall back to GCC atomics if C11 atomics have issues.

* fix install name

* disable openmp in files that also include stdatomic.h

* make sure LAPACK is visible to TH build file.
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index 136ce27..b90a46e 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -1,5 +1,10 @@
 cmake_minimum_required(VERSION 3.0)
-set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindCUDA ${CMAKE_MODULE_PATH})
+set(CMAKE_MODULE_PATH
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmake
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindCUDA
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/TH/cmake
+  ${CMAKE_MODULE_PATH})
+set(CMAKE_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/ ${CMAKE_LIBRARY_PATH})  # multiarch dir is a library search hint, not a module dir
 project(ATen)
 
 cmake_policy(SET CMP0012 NEW)
@@ -22,7 +27,331 @@
 # C++11
 set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}")
 
-# TH stuff
+
+# Top-level build config
+############################################
+# Flags
+# When using MSVC
+
+# Detect CUDA architecture and get best NVCC flags
+# finding cuda must be first because other things depend on the result
+IF(NOT CUDA_FOUND)
+  FIND_PACKAGE(CUDA 5.5)
+ENDIF()
+
+IF(MSVC)
+  # Keep using the standard C library functions: define _CRT_SECURE_NO_DEPRECATE so MSVC does not flag them as deprecated.
+  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819 -Xcompiler /wd4503")
+  ADD_DEFINITIONS(-DTH_EXPORTS)
+ENDIF(MSVC)
+
+IF (CMAKE_VERSION VERSION_LESS "3.1")
+  SET(CMAKE_C_FLAGS "-std=c11 ${CMAKE_C_FLAGS}")
+ELSE ()
+  SET(CMAKE_C_STANDARD 11)
+ENDIF ()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
+    if(CUDA_VERSION VERSION_LESS "8.0")
+      MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__")
+    endif(CUDA_VERSION VERSION_LESS "8.0")
+  endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
+endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+
+if(CUDA_VERSION VERSION_GREATER "8.0")
+
+endif(CUDA_VERSION VERSION_GREATER "8.0")
+
+LIST(APPEND CUDA_NVCC_FLAGS -Wno-deprecated-gpu-targets)
+
+if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  SET(CMAKE_CXX_STANDARD 11)
+endif()
+
+IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS)
+  INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
+ENDIF()
+LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
+CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
+LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+IF(CMAKE_POSITION_INDEPENDENT_CODE)
+  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+ENDIF()
+
+IF(CUDA_HAS_FP16 OR NOT "${CUDA_VERSION}" VERSION_LESS 7.5)  # quoted: CUDA_VERSION is empty when CUDA was not found
+  MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
+  LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
+  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
+ELSE()
+  MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
+ENDIF()
+
+OPTION(NDEBUG "disable asserts (WARNING: this may result in silent UB e.g. with out-of-bound indices)")
+IF(NOT NDEBUG)
+  MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS "" ${CMAKE_C_FLAGS})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_DEBUG "" ${CMAKE_C_FLAGS_DEBUG})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "" ${CMAKE_C_FLAGS_RELEASE})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS "" ${CMAKE_CXX_FLAGS})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
+  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
+ENDIF()
+
+if(CUDA_VERSION VERSION_GREATER "8.0")
+  LIST(APPEND CUDA_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__")
+endif(CUDA_VERSION VERSION_GREATER "8.0")
+
+# OpenMP support?
+SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
+IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
+  EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
+  STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
+  MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
+  IF (DARWIN_VERSION GREATER 9)
+    SET(APPLE_OPENMP_SUCKS 1)
+  ENDIF (DARWIN_VERSION GREATER 9)
+  EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
+    OUTPUT_VARIABLE GCC_VERSION)
+  IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
+    MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
+    MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
+    SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
+  ENDIF ()
+ENDIF ()
+
+IF (WITH_OPENMP)
+  FIND_PACKAGE(OpenMP)
+  IF(OPENMP_FOUND)
+    MESSAGE(STATUS "Compiling with OpenMP support")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+  ENDIF(OPENMP_FOUND)
+ENDIF (WITH_OPENMP)
+
+SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
+
+FIND_PACKAGE(MAGMA)
+IF(CUDA_FOUND AND MAGMA_FOUND)
+  INCLUDE_DIRECTORIES("${MAGMA_INCLUDE_DIR}")
+  SET(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}")
+  INCLUDE(CheckPrototypeDefinition)
+  check_prototype_definition(magma_get_sgeqrf_nb
+   "magma_int_t magma_get_sgeqrf_nb( magma_int_t m, magma_int_t n );"
+   "0"
+   "magma.h"
+    MAGMA_V2)
+  IF (MAGMA_V2)
+    add_definitions(-DMAGMA_V2)
+  ENDIF (MAGMA_V2)
+
+  SET(USE_MAGMA 1)
+  MESSAGE(STATUS "Compiling with MAGMA support")
+  MESSAGE(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}")
+  MESSAGE(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}")
+  MESSAGE(STATUS "MAGMA V2 check: ${MAGMA_V2}")
+ELSE()
+  MESSAGE(STATUS "MAGMA not found. Compiling without MAGMA support")
+ENDIF()
+
+
+# ARM specific flags
+FIND_PACKAGE(ARM)
+IF (ASIMD_FOUND)
+  MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__")
+  SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}")
+ELSEIF (NEON_FOUND)
+  MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
+  SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
+ENDIF (ASIMD_FOUND)
+IF (CORTEXA8_FOUND)
+  MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
+  SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")
+ENDIF (CORTEXA8_FOUND)
+IF (CORTEXA9_FOUND)
+  MESSAGE(STATUS "Cortex-A9 Found with compiler flag : -mcpu=cortex-a9")
+  SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}")
+ENDIF (CORTEXA9_FOUND)
+
+IF(UNIX)
+  # prevent Unknown CMake command "check_function_exists".
+  INCLUDE(CheckFunctionExists)
+ENDIF(UNIX)
+
+INCLUDE (CheckIncludeFile)
+INCLUDE (CheckCSourceCompiles)
+CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
+# Check for a cpuid intrinsic
+IF(HAVE_CPUID_H)
+    CHECK_C_SOURCE_COMPILES("#include <cpuid.h>
+        int main()
+        {
+            unsigned int eax, ebx, ecx, edx;
+            return __get_cpuid(0, &eax, &ebx, &ecx, &edx);
+        }" HAVE_GCC_GET_CPUID)
+ENDIF()
+IF(HAVE_GCC_GET_CPUID)
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID")
+ENDIF(HAVE_GCC_GET_CPUID)
+
+CHECK_C_SOURCE_COMPILES("#include <stdint.h>
+    static inline void cpuid(uint32_t *eax, uint32_t *ebx,
+    			 uint32_t *ecx, uint32_t *edx)
+    {
+      uint32_t a = *eax, b, c = *ecx, d;
+      asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );
+      *eax = a; *ebx = b; *ecx = c; *edx = d;
+    }
+    int main() {
+      uint32_t a,b,c,d;
+      cpuid(&a, &b, &c, &d);
+      return 0;
+    }" NO_GCC_EBX_FPIC_BUG)
+
+IF(NOT NO_GCC_EBX_FPIC_BUG)
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GCC_GET_CPUID")
+ENDIF(NOT NO_GCC_EBX_FPIC_BUG)
+
+FIND_PACKAGE(SSE) # checks SSE, AVX and AVX2
+IF(C_SSE2_FOUND)
+  MESSAGE(STATUS "SSE2 Found")
+  SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
+ENDIF(C_SSE2_FOUND)
+IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
+  SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}")
+ENDIF()
+IF(C_SSE3_FOUND)
+  MESSAGE(STATUS "SSE3 Found")
+  SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} -DUSE_SSE3 ${CMAKE_C_FLAGS}")
+ENDIF(C_SSE3_FOUND)
+
+# we don't set -mavx and -mavx2 flags globally, but only for specific files
+# however, we want to enable the AVX codepaths, so we still need to
+# add USE_AVX and USE_AVX2 macro defines
+IF(C_AVX_FOUND)
+  MESSAGE(STATUS "AVX Found")
+  SET(CMAKE_C_FLAGS "-DUSE_AVX ${CMAKE_C_FLAGS}")
+ENDIF(C_AVX_FOUND)
+IF(C_AVX2_FOUND)
+  MESSAGE(STATUS "AVX2 Found")
+  SET(CMAKE_C_FLAGS "-DUSE_AVX2 ${CMAKE_C_FLAGS}")
+ENDIF(C_AVX2_FOUND)
+
+CHECK_C_SOURCE_RUNS("
+#include <stdatomic.h>
+// ATOMIC_INT_LOCK_FREE is flaky on some older gcc versions
+// so if this define is not usable a preprocessor definition
+// we fail this check and fall back to GCC atomics
+#if ATOMIC_INT_LOCK_FREE == 2
+#define TH_ATOMIC_IPC_REFCOUNT 1
+#endif
+int main()
+{
+  int a;
+  int oa;
+  atomic_store(&a, 1);
+  atomic_fetch_add(&a, 1);
+  oa = atomic_load(&a);
+  if(!atomic_compare_exchange_strong(&a, &oa, 3))
+    return -1;
+  return 0;
+}
+" HAS_C11_ATOMICS)
+
+IF(NOT HAS_C11_ATOMICS)
+  CHECK_C_SOURCE_RUNS("
+#include <intrin.h>
+int main()
+{
+  long a;
+  _InterlockedExchange(&a, 1);
+  _InterlockedExchangeAdd(&a, 1);
+  if(_InterlockedCompareExchange(&a, 3, 2) != 2)
+    return -1;
+  return 0;
+}
+" HAS_MSC_ATOMICS)
+
+  CHECK_C_SOURCE_RUNS("
+int main()
+{
+  int a;
+  __sync_lock_test_and_set(&a, 1);
+  __sync_fetch_and_add(&a, 1);
+  if(!__sync_bool_compare_and_swap(&a, 2, 3))
+    return -1;
+  return 0;
+}
+" HAS_GCC_ATOMICS)
+ENDIF()
+
+IF(HAS_C11_ATOMICS)
+  ADD_DEFINITIONS(-DUSE_C11_ATOMICS=1)
+  MESSAGE(STATUS "Atomics: using C11 intrinsics")
+ELSEIF(HAS_MSC_ATOMICS)
+  ADD_DEFINITIONS(-DUSE_MSC_ATOMICS=1)
+  MESSAGE(STATUS "Atomics: using MSVC intrinsics")
+ELSEIF(HAS_GCC_ATOMICS)
+  ADD_DEFINITIONS(-DUSE_GCC_ATOMICS=1)
+  MESSAGE(STATUS "Atomics: using GCC intrinsics")
+ELSE()
+  SET(CMAKE_THREAD_PREFER_PTHREAD TRUE)  # last resort: fall back to pthread-based atomics
+  FIND_PACKAGE(Threads)
+  IF(THREADS_FOUND)
+    ADD_DEFINITIONS(-DUSE_PTHREAD_ATOMICS=1)
+    LINK_LIBRARIES(${CMAKE_THREAD_LIBS_INIT})  # the TH target is no longer created by this build; this applies to targets added later
+    MESSAGE(STATUS "Atomics: using pthread")
+  ENDIF()
+ENDIF()
+
+IF (WIN32 AND NOT CYGWIN)
+  SET(BLAS_INSTALL_LIBRARIES "OFF"
+    CACHE BOOL "Copy the required BLAS DLLs into the TH install dirs")
+ENDIF (WIN32 AND NOT CYGWIN)
+
+MACRO(Install_Required_Library ln)
+    get_filename_component(libpath ${ln} PATH)
+    get_filename_component(libname ${ln} NAME_WE)
+    file(GLOB libdlls "${libpath}/${libname}*.dll")
+    install(PROGRAMS ${libdlls}
+      DESTINATION "${TH_INSTALL_BIN_SUBDIR}")
+ENDMACRO(Install_Required_Library libname)
+
+IF (BLAS_FOUND AND BLAS_INSTALL_LIBRARIES)
+  IF (BLAS_goto2_LIBRARY)
+    Install_Required_Library(${BLAS_goto2_LIBRARY})
+    Install_Required_Library("${libpath}/libgfortran")
+    Install_Required_Library("${libpath}/libquadmath")
+    Install_Required_Library("${libpath}/libgcc")
+  ENDIF()
+  IF (BLAS_openblas_LIBRARY)
+    Install_Required_Library(${BLAS_openblas_LIBRARY})
+    Install_Required_Library("${libpath}/libquadmath")
+    Install_Required_Library("${libpath}/libgfortran")
+    Install_Required_Library("${libpath}/libgcc")
+  ENDIF()
+ENDIF()
+
+FIND_PACKAGE(LAPACK)
+IF(LAPACK_FOUND)
+  SET(USE_LAPACK 1)
+ENDIF(LAPACK_FOUND)
+
+#############################################
+
+set(ATen_CPU_SRCS)
+set(ATen_CPU_INCLUDE)
+set(ATen_CUDA_SRCS)
+set(ATen_CUDA_INCLUDE)
+SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory")
+SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory")
+SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory")
+
+
 set(Torch_FOUND 1)
 add_definitions(-DTH_INDEX_BASE=0)
 set(TH_LINK_STYLE STATIC)
diff --git a/aten/src/THCS/cmake/FindMAGMA.cmake b/aten/cmake/FindMAGMA.cmake
similarity index 100%
rename from aten/src/THCS/cmake/FindMAGMA.cmake
rename to aten/cmake/FindMAGMA.cmake
diff --git a/aten/src/THCS/cmake/select_compute_arch.cmake b/aten/cmake/select_compute_arch.cmake
similarity index 100%
rename from aten/src/THCS/cmake/select_compute_arch.cmake
rename to aten/cmake/select_compute_arch.cmake
diff --git a/aten/src/ATen/ATenConfig.cmake.in b/aten/src/ATen/ATenConfig.cmake.in
index f2ef158..e945926 100644
--- a/aten/src/ATen/ATenConfig.cmake.in
+++ b/aten/src/ATen/ATenConfig.cmake.in
@@ -1,8 +1,8 @@
 # Find the TH includes and library
 #
-# TH_INCLUDE_DIR -- where to find the includes
-# TH_LIBRARIES -- list of libraries to link against
-# TH_FOUND -- set to 1 if found
+# ATEN_INCLUDE_DIR -- where to find the includes
+# ATEN_LIBRARIES -- list of libraries to link against
+# ATEN_FOUND -- set to 1 if found
 
 SET(ATEN_FOUND 1)
 SET(ATEN_INCLUDE_DIR "@ATEN_INCLUDE_DIR@")
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index 055d9ba..a923dc4 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -16,9 +16,43 @@
 endif(${CMAKE_VERSION} VERSION_LESS "2.8.12")
 
 IF(NOT MSVC)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wno-vla")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-vla")
 ENDIF(NOT MSVC)
 
+########################
+# SET_SOURCE_FILES_PROPERTIES must be in the same CMakeLists.txt file as the target that includes the file
+# so we need to set these commands here rather than in src/TH
+IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
+  IF(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast")
+  ELSE(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math")
+  ENDIF(MSVC)
+ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
+IF(C_AVX_FOUND)
+  IF(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}")
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX ${C_AVX_FLAGS}")
+  ELSE(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}")
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/vector/AVX.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX_FLAGS}")
+  ENDIF(MSVC)
+ENDIF(C_AVX_FOUND)
+
+IF(C_AVX2_FOUND)
+  IF(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX2 ${C_AVX2_FLAGS}")
+  ELSE(MSVC)
+    SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX2_FLAGS}")
+  ENDIF(MSVC)
+ENDIF(C_AVX2_FOUND)
+
+IF(NOT MSVC)
+  SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/THAtomic.c PROPERTIES COMPILE_FLAGS "-fno-openmp")
+  SET_SOURCE_FILES_PROPERTIES(${PROJECT_SOURCE_DIR}/src/TH/THAllocator.c PROPERTIES COMPILE_FLAGS "-fno-openmp")
+ENDIF()
+########################
+
 ################################################################################
 # Helper functions
 ################################################################################
@@ -51,20 +85,6 @@
   FIND_PACKAGE(Torch REQUIRED)
 ENDIF()
 
-IF(NOT TH_LIBRARIES)
-  SET(TH_LIBRARIES "TH")
-ENDIF(NOT TH_LIBRARIES)
-MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
-
-IF(NOT THS_LIBRARIES)
-  SET(THS_LIBRARIES "THS")
-ENDIF()
-
-IF(NOT THNN_LIBRARIES)
-  SET(THNN_LIBRARIES "THNN")
-ENDIF(NOT THNN_LIBRARIES)
-MESSAGE(STATUS "THNN_LIBRARIES: ${THNN_LIBRARIES}")
-
 IF ($ENV{TH_BINARY_BUILD})
   MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++")
   SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
@@ -81,21 +101,6 @@
   FIND_PACKAGE(CUDA 5.5 REQUIRED)
   ADD_DEFINITIONS(-DAT_CUDA_ENABLED)
   INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
-
-  IF(NOT THC_LIBRARIES)
-    SET(THC_LIBRARIES "THC")
-  ENDIF(NOT THC_LIBRARIES)
-  MESSAGE(STATUS "THC_LIBRARIES: ${THC_LIBRARIES}")
-
-  IF(NOT THCS_LIBRARIES)
-    SET(THCS_LIBRARIES "THCS")
-  ENDIF(NOT THCS_LIBRARIES)
-  MESSAGE(STATUS "THCS_LIBRARIES: ${THCS_LIBRARIES}")
-
-  IF(NOT THCUNN_LIBRARIES)
-    SET(THCUNN_LIBRARIES "THCUNN")
-  ENDIF(NOT THCUNN_LIBRARIES)
-  MESSAGE(STATUS "THCUNN_LIBRARIES: ${THCUNN_LIBRARIES}")
 ENDIF()
 
 # Can be compiled standalone
@@ -164,7 +169,15 @@
 COMMAND ${GEN_COMMAND}
 DEPENDS ${all_python} ${all_templates} ${cwrap_files})
 
-SET(all_cpp ${base_cpp} ${generated_cpp})
+SET(all_cpp ${base_cpp} ${generated_cpp} ${ATen_CPU_SRCS})
+
+INCLUDE_DIRECTORIES(${ATen_CPU_INCLUDE})
+IF(NOT NO_CUDA)
+  INCLUDE_DIRECTORIES(${ATen_CUDA_INCLUDE})
+  INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
+  SET(all_cpp ${all_cpp} ${ATen_CUDA_SRCS})
+endif()
+
 filter_list(generated_h generated_cpp "\\.h$")
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/..)
@@ -173,31 +186,110 @@
 IF(NOT AT_LINK_STYLE)
   SET(AT_LINK_STYLE SHARED)
 ENDIF()
-ADD_LIBRARY(ATen ${AT_LINK_STYLE} ${all_cpp})
+IF(CUDA_FOUND)
+  CUDA_ADD_LIBRARY(ATen ${AT_LINK_STYLE} ${all_cpp})
+ELSE()
+  ADD_LIBRARY(ATen ${AT_LINK_STYLE} ${all_cpp})
+ENDIF()
+
 SET_TARGET_PROPERTIES(ATen PROPERTIES VERSION 1 SOVERSION 1)
 
 if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1")
     SET_PROPERTY(TARGET ATen PROPERTY CXX_STANDARD 11)
 endif(NOT ${CMAKE_VERSION} VERSION_LESS "3.1")
 
-if(APPLE)
-set(WHOLE_ARCHIVE -Wl,-all_load)
-set(END_WHOLE_ARCHIVE -Wl,-noall_load)
-else()
-set(WHOLE_ARCHIVE -Wl,--whole-archive)
-set(END_WHOLE_ARCHIVE -Wl,--no-whole-archive)
-endif()
 
-TARGET_LINK_LIBRARIES(ATen PRIVATE
-  ${WHOLE_ARCHIVE}
-  ${TH_LIBRARIES} ${THNN_LIBRARIES} ${THS_LIBRARIES}
-  ${END_WHOLE_ARCHIVE})
+FIND_PACKAGE(BLAS)
+IF(BLAS_FOUND)
+  SET(USE_BLAS 1)
+  IF ($ENV{TH_BINARY_BUILD})
+    MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
+    TARGET_LINK_LIBRARIES(ATen "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
+  ELSE ($ENV{TH_BINARY_BUILD})
+    TARGET_LINK_LIBRARIES(ATen ${BLAS_LIBRARIES})
+  ENDIF ($ENV{TH_BINARY_BUILD})
+
+  IF(BLAS_INFO STREQUAL "mkl")
+    ADD_DEFINITIONS(-DTH_BLAS_MKL)
+  ENDIF()
+ENDIF(BLAS_FOUND)
+
+IF(LAPACK_FOUND)
+  TARGET_LINK_LIBRARIES(ATen ${LAPACK_LIBRARIES})
+ENDIF(LAPACK_FOUND)
+
+IF (UNIX AND NOT APPLE)
+   INCLUDE(CheckLibraryExists)
+   # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830
+   CHECK_LIBRARY_EXISTS(rt clock_gettime "" NEED_LIBRT)  # 3rd argument is a library search location, not a header
+   IF(NEED_LIBRT)
+     TARGET_LINK_LIBRARIES(ATen rt)
+     SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} rt)
+   ENDIF(NEED_LIBRT)
+ENDIF(UNIX AND NOT APPLE)
+
+IF(UNIX)
+  SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
+  CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
+  IF(HAVE_MMAP)
+    ADD_DEFINITIONS(-DHAVE_MMAP=1)
+  ENDIF(HAVE_MMAP)
+  # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html
+  ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
+  CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
+  IF(HAVE_SHM_OPEN)
+    ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1)
+  ENDIF(HAVE_SHM_OPEN)
+  CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK)
+  IF(HAVE_SHM_UNLINK)
+    ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1)
+  ENDIF(HAVE_SHM_UNLINK)
+  CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
+  IF(HAVE_MALLOC_USABLE_SIZE)
+    ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1)
+  ENDIF(HAVE_MALLOC_USABLE_SIZE)
+ENDIF(UNIX)
+
+IF(NOT MSVC)
+  TARGET_LINK_LIBRARIES(ATen m)
+ENDIF(NOT MSVC)
+
+# Is __thread supported?
+IF(NOT MSVC)
+  CHECK_C_SOURCE_COMPILES("static __thread int x = 1; int main() { return x; }" C_HAS_THREAD)
+ELSE(NOT MSVC)
+  CHECK_C_SOURCE_COMPILES("static __declspec( thread ) int x = 1; int main() { return x; }" C_HAS_THREAD)
+ENDIF(NOT MSVC)
+IF(NOT C_HAS_THREAD)
+  MESSAGE(STATUS "Warning: __thread is not supported, generating thread-unsafe code")
+ELSE(NOT C_HAS_THREAD)
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTH_HAVE_THREAD")
+ENDIF(NOT C_HAS_THREAD)
+
 IF(CUDA_FOUND)
-  TARGET_LINK_LIBRARIES(ATen PRIVATE
-    ${WHOLE_ARCHIVE}
-    ${THC_LIBRARIES} ${THCUNN_LIBRARIES} ${THCS_LIBRARIES}
-    ${END_WHOLE_ARCHIVE})
-  TARGET_LINK_LIBRARIES(ATen ${CUDA_LIBRARIES})
+  TARGET_LINK_LIBRARIES(ATen
+    ${CUDA_LIBRARIES}
+    ${CUDA_cusparse_LIBRARY}
+    ${CUDA_curand_LIBRARY})
+  CUDA_ADD_CUBLAS_TO_TARGET(ATen)
+  IF(USE_MAGMA)
+    TARGET_LINK_LIBRARIES(ATen ${MAGMA_LIBRARIES})
+    IF ($ENV{TH_BINARY_BUILD})
+      # magma is linked statically and needs a BLAS, so the BLAS library has
+      # to be linked into ATen as well. Normally the BLAS that the TH code
+      # loads is enough, but in the binary builds TH uses static linkage to
+      # MKL, so it doesn't have all symbols that magma needs. So in this
+      # case, explicitly find a BLAS and link ATen against it, repeating the
+      # library list the same way the TH_BINARY_BUILD BLAS linkage above does.
+      SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../TH/cmake ${CMAKE_MODULE_PATH})
+      FIND_PACKAGE(BLAS)
+      IF(BLAS_FOUND)
+        TARGET_LINK_LIBRARIES(ATen "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
+      ELSE(BLAS_FOUND)
+        MESSAGE(FATAL_ERROR "Binary build needs blas to be found here")
+      ENDIF(BLAS_FOUND)
+    ENDIF($ENV{TH_BINARY_BUILD})
+  ENDIF(USE_MAGMA)
 ENDIF()
 
 INSTALL(TARGETS ATen
diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt
index 659a905..c19ca9c 100644
--- a/aten/src/TH/CMakeLists.txt
+++ b/aten/src/TH/CMakeLists.txt
@@ -1,397 +1,57 @@
-cmake_minimum_required(VERSION 2.6)
-
-# avoid some cmake warnings
-IF(POLICY CMP0026)
- CMAKE_POLICY(SET CMP0026 OLD)
-ENDIF()
-CMAKE_POLICY(SET CMP0012 NEW)
-CMAKE_POLICY(SET CMP0015 NEW)
-
-
-SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
-SET(CMAKE_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/ ${CMAKE_LIBRARY_PATH})
-
-# Can be compiled standalone
-IF(NOT TH_INSTALL_BIN_SUBDIR
-    OR NOT TH_INSTALL_LIB_SUBDIR
-    OR NOT TH_INSTALL_INCLUDE_SUBDIR
-    OR NOT TH_INSTALL_CMAKE_SUBDIR)
-
-  SET(TH_INSTALL_BIN_SUBDIR "bin" CACHE PATH "TH install binary subdirectory")
-  SET(TH_INSTALL_LIB_SUBDIR "lib" CACHE PATH "TH install library subdirectory")
-  SET(TH_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "TH install include subdirectory")
-  SET(TH_INSTALL_CMAKE_SUBDIR "share/cmake/TH" CACHE PATH "TH install cmake subdirectory")
-ENDIF()
-
-IF(NOT TH_LINK_STYLE)
-  set(TH_LINK_STYLE SHARED)
-ENDIF()
-
-######################################################################
-###### macros section
-#####################################################################
-IF(NOT ADD_TORCH_LIBRARY)
-MACRO(ADD_TORCH_LIBRARY package type src)
-  IF ("${type}" STREQUAL "STATIC")
-    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
-      CUDA_ADD_LIBRARY(${package} STATIC ${src})
-    else()
-      ADD_LIBRARY(${package} STATIC ${src})
-    endif()
-  ELSE()
-    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
-      CUDA_ADD_LIBRARY(${package} ${type} ${src})
-    else()
-      ADD_LIBRARY(${package} ${type} ${src})
-    endif()
-  ENDIF()
-ENDMACRO()
-ENDIF()
-
-#######################################################################
-##### flags section
-######################################################################
-
-IF(MSVC)
-  # MSVC now supports C99 since VS2013/VS2015, however the standard version switch is not provided yet
-  # SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99")
-ELSE(MSVC)
-  # enable gnu99 and not c99 because we use
-  # gnu extensions like posix_memalign
-  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99")
-ENDIF(MSVC)
-
-IF(MSVC)
-  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)  # respect the standard
-ENDIF(MSVC)
-
-IF(UNIX)
-  # prevent Unknown CMake command "check_function_exists".
-  INCLUDE(CheckFunctionExists)
-ENDIF(UNIX)
-
-# OpenMP support?
-SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
-IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
-  EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
-  STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
-  MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
-  IF (DARWIN_VERSION GREATER 9)
-    SET(APPLE_OPENMP_SUCKS 1)
-  ENDIF (DARWIN_VERSION GREATER 9)
-  EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
-    OUTPUT_VARIABLE GCC_VERSION)
-  IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
-    MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
-    MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
-    SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
-  ENDIF ()
-ENDIF ()
-
-IF (WITH_OPENMP)
-  FIND_PACKAGE(OpenMP)
-  IF(OPENMP_FOUND)
-    MESSAGE(STATUS "Compiling with OpenMP support ${OpenMP_C_FLAGS} ${OpenMP_CXX_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" PARENT_SCOPE)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" PARENT_SCOPE)
-    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}" PARENT_SCOPE)
-  ENDIF(OPENMP_FOUND)
-ENDIF (WITH_OPENMP)
-
-# ARM specific flags
-FIND_PACKAGE(ARM)
-IF (ASIMD_FOUND)
-  MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__")
-  SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}")
-ELSEIF (NEON_FOUND)
-  MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
-  SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
-ENDIF (ASIMD_FOUND)
-IF (CORTEXA8_FOUND)
-  MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
-  SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")
-ENDIF (CORTEXA8_FOUND)
-IF (CORTEXA9_FOUND)
-  MESSAGE(STATUS "Cortex-A9 Found with compiler flag : -mcpu=cortex-a9")
-  SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}")
-ENDIF (CORTEXA9_FOUND)
-
-INCLUDE (CheckIncludeFile)
-INCLUDE (CheckCSourceCompiles)
-CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
-# Check for a cpuid intrinsic
-IF(HAVE_CPUID_H)
-    CHECK_C_SOURCE_COMPILES("#include <cpuid.h>
-        int main()
-        {
-            unsigned int eax, ebx, ecx, edx;
-            return __get_cpuid(0, &eax, &ebx, &ecx, &edx);
-        }" HAVE_GCC_GET_CPUID)
-ENDIF()
-IF(HAVE_GCC_GET_CPUID)
-  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID")
-ENDIF(HAVE_GCC_GET_CPUID)
-
-CHECK_C_SOURCE_COMPILES("#include <stdint.h>
-    static inline void cpuid(uint32_t *eax, uint32_t *ebx,
-    			 uint32_t *ecx, uint32_t *edx)
-    {
-      uint32_t a = *eax, b, c = *ecx, d;
-      asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );
-      *eax = a; *ebx = b; *ecx = c; *edx = d;
-    }
-    int main() {
-      uint32_t a,b,c,d;
-      cpuid(&a, &b, &c, &d);
-      return 0;
-    }" NO_GCC_EBX_FPIC_BUG)
-
-IF(NOT NO_GCC_EBX_FPIC_BUG)
-  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GCC_GET_CPUID")
-ENDIF(NOT NO_GCC_EBX_FPIC_BUG)
-
-
-FIND_PACKAGE(SSE) # checks SSE, AVX and AVX2
-IF(C_SSE2_FOUND)
-  MESSAGE(STATUS "SSE2 Found")
-  SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
-ENDIF(C_SSE2_FOUND)
-IF(C_SSE3_FOUND)
-  MESSAGE(STATUS "SSE3 Found")
-  SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} -DUSE_SSE3 ${CMAKE_C_FLAGS}")
-ENDIF(C_SSE3_FOUND)
-# we don't set -mavx and -mavx2 flags globally, but only for specific files
-# however, we want to enable the AVX codepaths, so we still need to
-# add USE_AVX and USE_AVX2 macro defines
-IF(C_AVX_FOUND)
-  MESSAGE(STATUS "AVX Found")
-  SET(CMAKE_C_FLAGS "-DUSE_AVX ${CMAKE_C_FLAGS}")
-ENDIF(C_AVX_FOUND)
-IF(C_AVX2_FOUND)
-  MESSAGE(STATUS "AVX2 Found")
-  SET(CMAKE_C_FLAGS "-DUSE_AVX2 ${CMAKE_C_FLAGS}")
-ENDIF(C_AVX2_FOUND)
-
-CHECK_C_SOURCE_RUNS("
-#include <stdatomic.h>
-int main()
-{
-  int a;
-  int oa;
-  atomic_store(&a, 1);
-  atomic_fetch_add(&a, 1);
-  oa = atomic_load(&a);
-  if(!atomic_compare_exchange_strong(&a, &oa, 3))
-    return -1;
-  return 0;
-}
-" HAS_C11_ATOMICS)
-
-IF(NOT HAS_C11_ATOMICS)
-  CHECK_C_SOURCE_RUNS("
-#include <intrin.h>
-int main()
-{
-  long a;
-  _InterlockedExchange(&a, 1);
-  _InterlockedExchangeAdd(&a, 1);
-  if(_InterlockedCompareExchange(&a, 3, 2) != 2)
-    return -1;
-  return 0;
-}
-" HAS_MSC_ATOMICS)
-
-  CHECK_C_SOURCE_RUNS("
-int main()
-{
-  int a;
-  __sync_lock_test_and_set(&a, 1);
-  __sync_fetch_and_add(&a, 1);
-  if(!__sync_bool_compare_and_swap(&a, 2, 3))
-    return -1;
-  return 0;
-}
-" HAS_GCC_ATOMICS)
-ENDIF()
-
-#######################################################################
-##### sources section
-######################################################################
+set(extra_src)
 
 # IF ANY SIMD FOUND
 IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
-  SET(simd generic/simd/convolve.c)
+  LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve.c)
 ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
 
 # IF SSE4 FOUND
 IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
-  SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}")
-  IF(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast")
-  ELSE(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math")
-  ENDIF(MSVC)
-  SET(simd ${simd} generic/simd/convolve5x5_sse.c)
+  LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_sse.c)
 ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
 
 # IF AVX FOUND
 IF(C_AVX_FOUND)
-  IF(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}")
-    SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX ${C_AVX_FLAGS}")
-  ELSE(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}")
-    SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX_FLAGS}")
-  ENDIF(MSVC)
-  SET(simd ${simd} vector/AVX.c generic/simd/convolve5x5_avx.c)
+  LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/vector/AVX.c)
+  LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/generic/simd/convolve5x5_avx.c)
 ENDIF(C_AVX_FOUND)
 
 IF(C_AVX2_FOUND)
-  IF(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX2 ${C_AVX2_FLAGS}")
-  ELSE(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX2_FLAGS}")
-  ENDIF(MSVC)
-  SET(simd ${simd} vector/AVX2.c)
+  LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/vector/AVX2.c)
 ENDIF(C_AVX2_FOUND)
 
 SET(hdr
   THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
   THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
 
-SET(src
-  THGeneral.c THHalf.c THAllocator.c THSize.c THStorage.c THTensor.c THBlas.c THLapack.c
-  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
+set(th_srcs  # TH's C sources, listed in their original order
+  ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THSize.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THStorage.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THTensor.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THBlas.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THLapack.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THLogAdd.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THRandom.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THFile.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THDiskFile.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THMemoryFile.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THAtomic.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THVector.c
+  ${extra_src})  # SIMD sources appended above when the matching C_*_FOUND checks pass
+set(ATen_CPU_SRCS ${ATen_CPU_SRCS} ${th_srcs} PARENT_SCOPE)  # only the parent scope's copy is updated
+######################################################
 
-SET(src ${src} ${hdr} ${simd})
 
-#######################################################################
-##### build section
-######################################################################
+# Export this build directory (THGeneral.h is configured into it below) as a CPU include path.
+set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} "${CMAKE_CURRENT_BINARY_DIR}"
+  PARENT_SCOPE)
 
-ADD_TORCH_LIBRARY(TH ${TH_LINK_STYLE} "${src}")
-
-IF (BUILD_STATIC OR "$ENV{STATIC_TH}" STREQUAL "YES")
-  ADD_TORCH_LIBRARY(TH_static STATIC "${src}")
-  SET_TARGET_PROPERTIES(TH_static PROPERTIES
-    COMPILE_FLAGS "-fPIC")
-  SET_TARGET_PROPERTIES(TH_static PROPERTIES
-    PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "TH")
-ENDIF()
-
-IF(NOT TH_SO_VERSION)
-  SET(TH_SO_VERSION 0)
-ENDIF(NOT TH_SO_VERSION)
-MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}")
-SET_TARGET_PROPERTIES(TH PROPERTIES
-  VERSION   ${TH_SO_VERSION}
-  SOVERSION ${TH_SO_VERSION})
-
-IF(HAS_C11_ATOMICS)
-  ADD_DEFINITIONS(-DUSE_C11_ATOMICS=1)
-  MESSAGE(STATUS "Atomics: using C11 intrinsics")
-ELSEIF(HAS_MSC_ATOMICS)
-  ADD_DEFINITIONS(-DUSE_MSC_ATOMICS=1)
-  MESSAGE(STATUS "Atomics: using MSVC intrinsics")
-ELSEIF(HAS_GCC_ATOMICS)
-  ADD_DEFINITIONS(-DUSE_GCC_ATOMICS=1)
-    MESSAGE(STATUS "Atomics: using GCC intrinsics")
-ELSE()
-  SET(CMAKE_THREAD_PREFER_PTHREAD TRUE)
-  FIND_PACKAGE(Threads)
-  IF(THREADS_FOUND)
-    ADD_DEFINITIONS(-DUSE_PTHREAD_ATOMICS=1)
-    TARGET_LINK_LIBRARIES(TH ${CMAKE_THREAD_LIBS_INIT})
-    MESSAGE(STATUS "Atomics: using pthread")
-  ENDIF()
-ENDIF()
-
-FIND_PACKAGE(BLAS)
-IF(BLAS_FOUND)
-  SET(USE_BLAS 1)
-  IF ($ENV{TH_BINARY_BUILD})
-    MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
-    TARGET_LINK_LIBRARIES(TH "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
-  ELSE ($ENV{TH_BINARY_BUILD})
-    TARGET_LINK_LIBRARIES(TH ${BLAS_LIBRARIES})
-  ENDIF ($ENV{TH_BINARY_BUILD})
-
-  IF(BLAS_INFO STREQUAL "mkl")
-    ADD_DEFINITIONS(-DTH_BLAS_MKL)
-  ENDIF()
-ENDIF(BLAS_FOUND)
-
-FIND_PACKAGE(LAPACK)
-IF(LAPACK_FOUND)
-  SET(USE_LAPACK 1)
-  TARGET_LINK_LIBRARIES(TH ${LAPACK_LIBRARIES})
-ENDIF(LAPACK_FOUND)
-
-IF (UNIX AND NOT APPLE)
-   INCLUDE(CheckLibraryExists)
-   # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830
-   CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" NEED_LIBRT)
-   IF(NEED_LIBRT)
-     TARGET_LINK_LIBRARIES(TH rt)
-     SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} rt)
-   ENDIF(NEED_LIBRT)
-ENDIF(UNIX AND NOT APPLE)
-
-IF(UNIX)
-  SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
-  CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
-  IF(HAVE_MMAP)
-    ADD_DEFINITIONS(-DHAVE_MMAP=1)
-  ENDIF(HAVE_MMAP)
-  # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html
-  ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
-  CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
-  IF(HAVE_SHM_OPEN)
-    ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1)
-  ENDIF(HAVE_SHM_OPEN)
-  CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK)
-  IF(HAVE_SHM_UNLINK)
-    ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1)
-  ENDIF(HAVE_SHM_UNLINK)
-  CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
-  IF(HAVE_MALLOC_USABLE_SIZE)
-    ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1)
-  ENDIF(HAVE_MALLOC_USABLE_SIZE)
-ENDIF(UNIX)
-
-IF(NOT MSVC)
-  TARGET_LINK_LIBRARIES(TH m)
-ENDIF(NOT MSVC)
-
-# Is __thread supported?
-IF(NOT MSVC)
-  CHECK_C_SOURCE_COMPILES("static __thread int x = 1; int main() { return x; }" C_HAS_THREAD)
-ELSE(NOT MSVC)
-  CHECK_C_SOURCE_COMPILES("static __declspec( thread ) int x = 1; int main() { return x; }" C_HAS_THREAD)
-ENDIF(NOT MSVC)
-IF(NOT C_HAS_THREAD)
-  MESSAGE(STATUS "Warning: __thread is not supported, generating thread-unsafe code")
-ELSE(NOT C_HAS_THREAD)
-  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTH_HAVE_THREAD")
-ENDIF(NOT C_HAS_THREAD)
-
-INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
 CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h")
 
 
-#######################################################################
-##### install section
-######################################################################
-
-IF(TH_INSTALL_LIBRARIES)
-  INSTALL(TARGETS TH
-    EXPORT TH-exports
-    RUNTIME DESTINATION "${TH_INSTALL_BIN_SUBDIR}"
-    LIBRARY DESTINATION "${TH_INSTALL_LIB_SUBDIR}"
-    ARCHIVE DESTINATION "${TH_INSTALL_LIB_SUBDIR}")
-ENDIF()
-
 INSTALL(FILES
   TH.h
   THAllocator.h
@@ -425,12 +85,12 @@
   THVector.h
   THAtomic.h
   THHalf.h
-  DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH")
+  DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH")
 
 INSTALL(FILES
   vector/AVX.h
   vector/AVX2.h
-  DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/vector")
+  DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/vector")
 
 INSTALL(FILES
   generic/THBlas.c
@@ -455,42 +115,4 @@
   generic/THTensorRandom.h
   generic/THVectorDispatch.c
   generic/THVector.h
-  DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/generic")
-
-
-IF (WIN32 AND NOT CYGWIN)
-  SET(BLAS_INSTALL_LIBRARIES "OFF"
-    CACHE BOOL "Copy the required BLAS DLLs into the TH install dirs")
-ENDIF (WIN32 AND NOT CYGWIN)
-
-MACRO(Install_Required_Library ln)
-    get_filename_component(libpath ${ln} PATH)
-    get_filename_component(libname ${ln} NAME_WE)
-    file(GLOB libdlls "${libpath}/${libname}*.dll")
-    install(PROGRAMS ${libdlls}
-      DESTINATION "${TH_INSTALL_BIN_SUBDIR}")
-ENDMACRO(Install_Required_Library libname)
-
-IF (BLAS_FOUND AND BLAS_INSTALL_LIBRARIES)
-  IF (BLAS_goto2_LIBRARY)
-    Install_Required_Library(${BLAS_goto2_LIBRARY})
-    Install_Required_Library("${libpath}/libgfortran")
-    Install_Required_Library("${libpath}/libquadmath")
-    Install_Required_Library("${libpath}/libgcc")
-  ENDIF()
-  IF (BLAS_openblas_LIBRARY)
-    Install_Required_Library(${BLAS_openblas_LIBRARY})
-    Install_Required_Library("${libpath}/libquadmath")
-    Install_Required_Library("${libpath}/libgfortran")
-    Install_Required_Library("${libpath}/libgcc")
-  ENDIF()
-ENDIF()
-
-# Create THConfig.cmake
-GET_TARGET_PROPERTY(TH_OUTPUT_NAME TH LOCATION)
-GET_FILENAME_COMPONENT(TH_OUTPUT_NAME ${TH_OUTPUT_NAME} NAME)
-SET(TH_LIBRARIES "${CMAKE_INSTALL_PREFIX}/${TH_INSTALL_LIB_SUBDIR}/${TH_OUTPUT_NAME}")
-SET(TH_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${TH_INSTALL_INCLUDE_SUBDIR}/TH")
-CONFIGURE_FILE(THConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/THConfig.cmake")
-INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/THConfig.cmake"
-  DESTINATION "${TH_INSTALL_CMAKE_SUBDIR}")
+  DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH/generic")
diff --git a/aten/src/TH/vector/AVX2.c b/aten/src/TH/vector/AVX2.c
index 082a680..70a19bc 100644
--- a/aten/src/TH/vector/AVX2.c
+++ b/aten/src/TH/vector/AVX2.c
@@ -3,6 +3,7 @@
 #include <x86intrin.h>
 #else
 #include <intrin.h>
+#include <immintrin.h>
 #endif
 #include "AVX2.h"
 
diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt
index fc918f2..d3a9b70 100644
--- a/aten/src/THC/CMakeLists.txt
+++ b/aten/src/THC/CMakeLists.txt
@@ -1,213 +1,11 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
-CMAKE_POLICY(VERSION 2.8)
+set(thc_include_dirs "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+# Build dir stays first: THCGeneral.h is generated into it by the CONFIGURE_FILE call below.
+set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} ${thc_include_dirs}
+  PARENT_SCOPE)
 
-SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
-
-SET(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
-OPTION(NDEBUG "disable asserts (WARNING: this may result in invalid memory accesses)")
-IF(NOT NDEBUG)
-  MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS "" ${CMAKE_C_FLAGS})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_DEBUG "" ${CMAKE_C_FLAGS_DEBUG})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "" ${CMAKE_C_FLAGS_RELEASE})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS "" ${CMAKE_CXX_FLAGS})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
-ENDIF()
-
-IF(NOT Torch_FOUND)
-  FIND_PACKAGE(Torch)
-ENDIF()
-
-IF(NOT TH_LIBRARIES)
-  SET(TH_LIBRARIES "TH")
-ENDIF(NOT TH_LIBRARIES)
-MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
-
-IF(NOT CUDA_FOUND)
-  FIND_PACKAGE(CUDA 5.5 REQUIRED)
-ENDIF()
-
-IF(NOT MAGMA_FOUND)
-  FIND_PACKAGE(MAGMA)
-ENDIF()
-
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
-    if(CUDA_VERSION VERSION_LESS "8.0")
-      MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__")
-    endif(CUDA_VERSION VERSION_LESS "8.0")
-  endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
-endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-
-
-if(CUDA_VERSION VERSION_GREATER "8.0")
-  LIST(APPEND CUDA_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__")
-endif(CUDA_VERSION VERSION_GREATER "8.0")
-
-IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.7" OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "4.7" )
-    SET(CXX_VERSION "c++11")
-  ELSE()
-    SET(CXX_VERSION "c++0x")
-  ENDIF()
-  SET_SOURCE_FILES_PROPERTIES(
-    THCTensorRandom.cpp
-    THCCachingAllocator.cpp
-    THCCachingHostAllocator.cpp
-    THCStream.cpp
-    PROPERTIES COMPILE_FLAGS -std=${CXX_VERSION})
-ELSE()
-  SET(CMAKE_CXX_STANDARD 11)
-ENDIF()
-
-
-INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
-INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
-
-IF ("$ENV{STATIC_TH}" STREQUAL "YES")
-LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
-ENDIF()
-
-IF(MAGMA_FOUND)
-  INCLUDE_DIRECTORIES(${MAGMA_INCLUDE_DIR})
-  SET(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}")
-  INCLUDE(CheckPrototypeDefinition)
-  check_prototype_definition(magma_get_sgeqrf_nb
-   "magma_int_t magma_get_sgeqrf_nb( magma_int_t m, magma_int_t n );"
-   "0"
-   "magma.h"
-    MAGMA_V2)
-  IF (MAGMA_V2)
-    add_definitions(-DMAGMA_V2)
-  ENDIF (MAGMA_V2)
-
-  SET(USE_MAGMA 1)
-  MESSAGE(STATUS "Compiling with MAGMA support")
-  MESSAGE(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}")
-  MESSAGE(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}")
-  MESSAGE(STATUS "MAGMA V2 check: ${MAGMA_V2}")
-ELSE(MAGMA_FOUND)
-  MESSAGE(STATUS "MAGMA not found. Compiling without MAGMA support")
-ENDIF(MAGMA_FOUND)
-
-IF ($ENV{TH_BINARY_BUILD})
-  MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++")
-  SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
-  IF (UNIX AND NOT APPLE)
-    # hiding statically linked library symbols, this flag is not available for the linker under MACOSX
-    SET(CMAKE_CXX_FLAGS "-Wl,--exclude-libs,libstdc++.a ${CMAKE_CXX_FLAGS}")
-  ENDIF(UNIX AND NOT APPLE)
-ENDIF()
-
-IF(APPLE)
-  IF(${CUDA_VERSION} LESS 6.0)
-    # work around for mac os x bug:
-    # http://stackoverflow.com/questions/16286588/cuda-5-0-cmake-and-make-failing-on-osx-10-8-3
-    if (NOT DEFINED CUDA_HOST_COMPILER AND CMAKE_C_COMPILER_ID STREQUAL "Clang" AND EXISTS /usr/bin/gcc)
-      set(CUDA_HOST_COMPILER /usr/bin/gcc CACHE FILEPATH "Host side compiler used by NVCC")
-      message(STATUS "Setting CMAKE_HOST_COMPILER to /usr/bin/gcc instead of ${CMAKE_C_COMPILER}.")
-    endif()
-
-    # bug on Apple
-    LINK_DIRECTORIES("/usr/local/cuda/lib/")
-  ELSEIF(${CUDA_VERSION} LESS 7.0)
-    SET(CUDA_HOST_COMPILER clang)
-    LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -stdlib=libstdc++ -Xlinker -stdlib=libstdc++")
-    IF("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
-    ENDIF()
-  ENDIF()
-  # CUDA 7 supports clang and libc++ so no need to change anything
-ENDIF(APPLE)
-
-# Detect CUDA architecture and get best NVCC flags
-IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS OR MSVC)
-  INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
-ENDIF()
-LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
-CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
-LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
-IF(CMAKE_POSITION_INDEPENDENT_CODE)
-  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
-ENDIF()
-
-IF(NOT THC_INSTALL_BIN_SUBDIR
-    OR NOT THC_INSTALL_LIB_SUBDIR
-    OR NOT THC_INSTALL_INCLUDE_SUBDIR
-    OR NOT THC_INSTALL_CMAKE_SUBDIR)
-
-  INCLUDE_DIRECTORIES(${TH_INCLUDE_PATH} ${TH_INCLUDE_PATH}/TH)
-  LINK_DIRECTORIES(${TH_LIB_PATH})
-
-  IF(Torch_INSTALL_BIN_SUBDIR)
-    SET(THC_INSTALL_BIN_SUBDIR ${Torch_INSTALL_BIN_SUBDIR})
-    SET(THC_INSTALL_LIB_SUBDIR ${Torch_INSTALL_LIB_SUBDIR})
-    SET(THC_INSTALL_INCLUDE_SUBDIR ${Torch_INSTALL_INCLUDE_SUBDIR})
-    SET(THC_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR})
-  ELSE(Torch_INSTALL_BIN_SUBDIR)
-    # not installing in a Torch context, so Torch_INSTALL_BIN_SUBDIR is not available
-    SET(THC_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THC install binary subdirectory")
-    SET(THC_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THC install library subdirectory")
-    SET(THC_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THC install include subdirectory")
-    SET(THC_INSTALL_CMAKE_SUBDIR "share/cmake/THC" CACHE PATH "THC install cmake subdirectory")
-  ENDIF(Torch_INSTALL_BIN_SUBDIR)
-
-ENDIF()
-IF(NOT TH_LINK_STYLE)
-  set(TH_LINK_STYLE SHARED)
-ENDIF()
-
-
-INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}")
-INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
 CONFIGURE_FILE(THCGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THCGeneral.h")
 
-IF(MSVC)
-  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819 -Xcompiler /wd4503")
-ELSE()
-  SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
-ENDIF()
-
-SET(src
-    THCCachingAllocator.cpp
-    THCCachingHostAllocator.cpp
-    THCGeneral.c
-    THCStorageCopy.c
-    THCStream.cpp
-    THCTensor.c
-    THCTensorCopy.c
-    THCTensorRandom.cpp
-    THCThreadLocal.c
-    )
-
-SET(src-cuda
-  THCReduceApplyUtils.cu
-  THCBlas.cu
-  THCSleep.cu
-  THCStorage.cu
-  THCStorageCopy.cu
-  THCTensor.cu
-  THCTensorCopy.cu
-  THCTensorMath.cu
-  THCTensorMath2.cu
-  THCTensorMathBlas.cu
-  THCTensorMathMagma.cu
-  THCTensorMathPairwise.cu
-  THCTensorMathReduce.cu
-  THCTensorMathScan.cu
-  THCTensorIndex.cu
-  THCTensorConv.cu
-  THCTensorRandom.cu
-  THCTensorScatterGather.cu
-  THCTensorTopK.cu
-  THCTensorSort.cu
-  THCTensorTypeUtils.cu
-  THCSortUtils.cu
-  THCTensorMode.cu
-  )
-
+set(extra_src)
 # loop over all types
 foreach(THC_TYPE Byte Char Short Int Long Half Float Double)
    # loop over files which need to be split between types (because of long compile times)
@@ -216,65 +14,50 @@
          FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu"
               "#include \"../THC${THC_FILE}.cuh\"\n#include \"../generic/THC${THC_FILE}.cu\"\n#include \"../THCGenerate${THC_TYPE}Type.h\"\n")
       endif()
-      LIST(APPEND src-cuda "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu")
+      LIST(APPEND extra_src "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}${THC_TYPE}.cu")
    endforeach()
 endforeach()
 
-MESSAGE(STATUS "got cuda version " ${CUDA_VERSION})
-
 IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
-  MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
-  LIST(APPEND src-cuda THCHalf.cu)
-  LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
-  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
-ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
-  MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
-ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
-
-MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
-IF ("$ENV{STATIC_TH}" STREQUAL "YES")
-  CUDA_ADD_LIBRARY(THC STATIC ${src} ${src-cuda})
-  SET_TARGET_PROPERTIES(THC PROPERTIES COMPILE_FLAGS "-fPIC")
-ELSE()
-  CUDA_ADD_LIBRARY(THC ${TH_LINK_STYLE} ${src} ${src-cuda})
-  CUDA_ADD_CUBLAS_TO_TARGET(THC)
-  TARGET_LINK_LIBRARIES(THC ${TH_LIBRARIES} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY})
-
-  IF(USE_MAGMA)
-    TARGET_LINK_LIBRARIES(THC ${MAGMA_LIBRARIES})
-    IF ($ENV{TH_BINARY_BUILD})
-      # because magma is linked statically and it wants a BLAS,
-      # we need to link the BLAS lib against THC. Usually TH will
-      # load a BLAS library and it's all fine, but in the binary builds,
-      # TH uses static linkage to MKL, so it doesn't have all symbols that
-      # magma needs. So in this case, explicitly find a BLAS and link against it
-      # just like in TH
-      SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../TH/cmake ${CMAKE_MODULE_PATH})
-      FIND_PACKAGE(BLAS)
-      IF(BLAS_FOUND)
-        TARGET_LINK_LIBRARIES(THC "${BLAS_LIBRARIES};${BLAS_LIBRARIES};${BLAS_LIBRARIES}")
-      ELSE(BLAS_FOUND)
-        MESSAGE(FATAL_ERROR "Binary build needs blas to be found here")
-      ENDIF(BLAS_FOUND)
-    ENDIF($ENV{TH_BINARY_BUILD})
-  ENDIF(USE_MAGMA)
-
-  IF(NOT THC_SO_VERSION)
-    SET(THC_SO_VERSION 0)
-  ENDIF(NOT THC_SO_VERSION)
-  MESSAGE(STATUS "THC_SO_VERSION: ${THC_SO_VERSION}")
-  SET_TARGET_PROPERTIES(THC PROPERTIES
-    VERSION   ${THC_SO_VERSION}
-    SOVERSION ${THC_SO_VERSION})
-
-  IF(TH_INSTALL_LIBRARIES)
-    INSTALL(TARGETS THC
-      RUNTIME DESTINATION "${THC_INSTALL_BIN_SUBDIR}"
-      LIBRARY DESTINATION "${THC_INSTALL_LIB_SUBDIR}"
-      ARCHIVE DESTINATION "${THC_INSTALL_LIB_SUBDIR}")
-  ENDIF()
+  LIST(APPEND extra_src ${CMAKE_CURRENT_SOURCE_DIR}/THCHalf.cu)
 ENDIF()
 
+set(thc_host_srcs  # THC's .c/.cpp sources
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCCachingAllocator.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCCachingHostAllocator.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCGeneral.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCStream.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCThreadLocal.c)
+set(thc_cuda_srcs  # THC's .cu sources
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCReduceApplyUtils.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCBlas.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCSleep.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCStorage.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCStorageCopy.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensor.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorCopy.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMath.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMath2.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathBlas.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathMagma.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathPairwise.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathReduce.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMathScan.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorIndex.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorConv.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorRandom.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorScatterGather.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTopK.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorSort.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorTypeUtils.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCSortUtils.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCTensorMode.cu
+  ${extra_src})  # generated per-type .cu files, plus THCHalf.cu when the FP16 check above passes
+set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${thc_host_srcs} ${thc_cuda_srcs} PARENT_SCOPE)
 
 INSTALL(FILES
           THC.h
@@ -328,7 +111,7 @@
           THCThrustAllocator.cuh
           THCTensorMode.cuh
           THCTensorTopK.cuh
-          DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
+          DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC")
 
 INSTALL(FILES
           generic/THCStorage.c
@@ -376,4 +159,4 @@
           generic/THCTensorMode.cu
           generic/THCTensorTopK.h
           generic/THCTensorTopK.cu
-          DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic")
+          DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/aten/src/THC/THCCachingHostAllocator.cpp b/aten/src/THC/THCCachingHostAllocator.cpp
index a43cb30..867fda6 100644
--- a/aten/src/THC/THCCachingHostAllocator.cpp
+++ b/aten/src/THC/THCCachingHostAllocator.cpp
@@ -134,7 +134,6 @@
   cudaError_t recordEvent(void* ptr, THCStream *stream)
   {
     std::lock_guard<std::mutex> lock(mutex);
-    cudaError_t err;
 
     auto it = blocks.find(ptr);
     if (it == blocks.end()) {
diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu
index 5ff5700..c461ed9 100644
--- a/aten/src/THC/generic/THCTensorMath.cu
+++ b/aten/src/THC/generic/THCTensorMath.cu
@@ -224,7 +224,7 @@
         stackInputs[j].offset = offset;
         stackInputs[j].dimSize = dimSize;
         stackInputs[j].nElements = THCTensor_(nElement)(state, inputs[i+j]);
-        cohortMax = cohortMax > stackInputs[j].nElements ? cohortMax : stackInputs[j].nElements;
+        cohortMax = cohortMax > (int) stackInputs[j].nElements ? cohortMax : (int) stackInputs[j].nElements;
 
         // update offset
         offset += dimSize;
diff --git a/aten/src/THCS/CMakeLists.txt b/aten/src/THCS/CMakeLists.txt
index 541525b..2471cb6 100644
--- a/aten/src/THCS/CMakeLists.txt
+++ b/aten/src/THCS/CMakeLists.txt
@@ -1,161 +1,21 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
-CMAKE_POLICY(VERSION 2.8)
-
-SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
-
-IF(NOT Torch_FOUND)
-  FIND_PACKAGE(Torch)
-ENDIF()
-IF(NOT CUDA_FOUND)
-  FIND_PACKAGE(CUDA 5.5 REQUIRED)
-ENDIF()
 IF(NOT MAGMA_FOUND)
   FIND_PACKAGE(MAGMA)
 ENDIF()
-IF(NOT TH_LIBRARIES)
-  SET(TH_LIBRARIES "TH")
-ENDIF(NOT TH_LIBRARIES)
-MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
-IF(NOT THC_LIBRARIES)
-  SET(THC_LIBRARIES "THC")
-ENDIF(NOT THC_LIBRARIES)
-MESSAGE(STATUS "THC_LIBRARIES: ${THC_LIBRARIES}")
-IF(NOT THS_LIBRARIES)
-  SET(THS_LIBRARIES "THS")
-ENDIF(NOT THS_LIBRARIES)
-MESSAGE(STATUS "THS_LIBRARIES: ${THS_LIBRARIES}")
 
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
-    if(CUDA_VERSION VERSION_LESS "8.0")
-      MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES")
-    endif(CUDA_VERSION VERSION_LESS "8.0")
-  endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
-endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+# No directory-scoped include paths are set here; THCS exports its include dirs through ATen_CUDA_INCLUDE.
 
-if(CUDA_VERSION VERSION_GREATER "8.0")
-  LIST(APPEND CUDA_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__")
-endif(CUDA_VERSION VERSION_GREATER "8.0")
-
-if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  SET(CMAKE_CXX_STANDARD 11)
-endif()
-
-IF ($ENV{TH_BINARY_BUILD})
-  MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++")
-  SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
-  IF (UNIX AND NOT APPLE)
-    # hiding statically linked library symbols, this flag is not available for the linker under MACOSX
-    SET(CMAKE_CXX_FLAGS "-Wl,--exclude-libs,libstdc++.a ${CMAKE_CXX_FLAGS}")
-  ENDIF(UNIX AND NOT APPLE)
-ENDIF()
-
-INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
-INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
-
-IF(APPLE)
-  IF(${CUDA_VERSION} LESS 6.0)
-    # work around for mac os x bug:
-    # http://stackoverflow.com/questions/16286588/cuda-5-0-cmake-and-make-failing-on-osx-10-8-3
-    if (NOT DEFINED CUDA_HOST_COMPILER AND CMAKE_C_COMPILER_ID STREQUAL "Clang" AND EXISTS /usr/bin/gcc)
-      set(CUDA_HOST_COMPILER /usr/bin/gcc CACHE FILEPATH "Host side compiler used by NVCC")
-      message(STATUS "Setting CMAKE_HOST_COMPILER to /usr/bin/gcc instead of ${CMAKE_C_COMPILER}.")
-    endif()
-
-    # bug on Apple
-    LINK_DIRECTORIES("/usr/local/cuda/lib/")
-  ELSEIF(${CUDA_VERSION} LESS 7.0)
-    SET(CUDA_HOST_COMPILER clang)
-    LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -stdlib=libstdc++ -Xlinker -stdlib=libstdc++")
-    IF("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
-    ENDIF()
-  ENDIF()
-  # CUDA 7 supports clang and libc++ so no need to change anything
-ENDIF(APPLE)
-
-# Detect CUDA architecture and get best NVCC flags
-IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS)
-  INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
-ENDIF()
-LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
-CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
-LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
-IF(CMAKE_POSITION_INDEPENDENT_CODE)
-  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
-ENDIF()
+set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE}  # keep whatever the list already holds and append THCS's paths
+  ${TH_INCLUDE_PATH}/TH  # NOTE(review): TH_INCLUDE_PATH is not set in the visible part of this file; if empty this expands to /TH - confirm the parent defines it
+  "${CMAKE_CURRENT_BINARY_DIR}"  # this directory's build tree
+PARENT_SCOPE)  # the value is written to the parent scope, not the local one
 
 
-IF(NOT THCS_INSTALL_BIN_SUBDIR
-    OR NOT THCS_INSTALL_LIB_SUBDIR
-    OR NOT THCS_INSTALL_INCLUDE_SUBDIR
-    OR NOT THCS_INSTALL_CMAKE_SUBDIR)
-
-  INCLUDE_DIRECTORIES(${TH_INCLUDE_PATH} ${TH_INCLUDE_PATH}/TH)
-  LINK_DIRECTORIES(${TH_LIB_PATH})
-
-  SET(THCS_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THCS install binary subdirectory")
-  SET(THCS_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THCS install library subdirectory")
-  SET(THCS_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THCS install include subdirectory")
-  SET(THCS_INSTALL_CMAKE_SUBDIR "share/cmake/THCS" CACHE PATH "THCS install cmake subdirectory")
-ELSE()
-  SET(THCS_INSTALL_BIN_SUBDIR ${Torch_INSTALL_BIN_SUBDIR})
-  SET(THCS_INSTALL_LIB_SUBDIR ${Torch_INSTALL_LIB_SUBDIR})
-  SET(THCS_INSTALL_INCLUDE_SUBDIR ${Torch_INSTALL_INCLUDE_SUBDIR})
-  SET(THCS_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR})
-ENDIF()
-IF(NOT TH_LINK_STYLE)
-  set(TH_LINK_STYLE SHARED)
-ENDIF()
-
-INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
-# CONFIGURE_FILE(THCSGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THCSGeneral.h")
-
-SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
-
-IF(MSVC)
-  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
-ENDIF(MSVC)
-
-SET(src
-  THCSTensor.c
-  )
-
-SET(src-cuda
-  THCSTensor.cu
-  THCSparse.cu
-  )
-
-MESSAGE(STATUS "got cuda version " ${CUDA_VERSION})
-
-IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
-  MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
-  LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
-ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
-  MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
-ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
-
-MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
-
-CUDA_ADD_LIBRARY(THCS ${TH_LINK_STYLE} ${src} ${src-cuda})
-CUDA_ADD_CUBLAS_TO_TARGET(THCS)
-TARGET_LINK_LIBRARIES(THCS ${TH_LIBRARIES} ${THC_LIBRARIES} ${THS_LIBRARIES} ${CUDA_cusparse_LIBRARY})
-
-IF(NOT THCS_SO_VERSION)
-  SET(THCS_SO_VERSION 1)
-ENDIF(NOT THCS_SO_VERSION)
-MESSAGE(STATUS "THCS_SO_VERSION: ${THCS_SO_VERSION}")
-SET_TARGET_PROPERTIES(THCS PROPERTIES
-  VERSION   ${THCS_SO_VERSION}
-  SOVERSION ${THCS_SO_VERSION})
-
-IF(TH_INSTALL_LIBRARIES)
-  INSTALL(TARGETS THCS
-            RUNTIME DESTINATION "${THCS_INSTALL_BIN_SUBDIR}"
-            LIBRARY DESTINATION "${THCS_INSTALL_LIB_SUBDIR}"
-            ARCHIVE DESTINATION "${THCS_INSTALL_LIB_SUBDIR}")
-ENDIF()
+set(thcs_srcs  # THCS sources contributed to the ATen CUDA source list
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCSTensor.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCSTensor.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/THCSparse.cu)
+# Append to the inherited list and write the result to the parent scope.
+set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${thcs_srcs} PARENT_SCOPE)
 
 INSTALL(FILES
           THCS.h
@@ -171,7 +31,7 @@
           THCSGenerateFloatTypes.h
           THCSGenerateDoubleType.h
           THCSparse.h
-          DESTINATION "${THCS_INSTALL_INCLUDE_SUBDIR}/THCS")
+          DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THCS")
 
 INSTALL(FILES
           generic/THCSTensor.c
@@ -179,4 +39,4 @@
           generic/THCSTensor.h
           generic/THCSTensorMath.h
           generic/THCSTensorMath.cu
-          DESTINATION "${THCS_INSTALL_INCLUDE_SUBDIR}/THCS/generic")
+          DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THCS/generic")
diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt
index 650bf68..bf5e317 100644
--- a/aten/src/THCUNN/CMakeLists.txt
+++ b/aten/src/THCUNN/CMakeLists.txt
@@ -1,102 +1,86 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
-CMAKE_POLICY(VERSION 2.8)
+set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS}  # explicit THCUNN .cu list (replaces the removed FILE(GLOB ...))
+  ${CMAKE_CURRENT_SOURCE_DIR}/AbsCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/Abs.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/BatchNormalization.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/BCECriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/ClassNLLCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/DistKLDivCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/ELU.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/FeatureLPPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/FusedRNNKernel.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/GatedLinearUnit.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/HardTanh.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/IndexLinear.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/L1Cost.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/LeakyReLU.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/LogSigmoid.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/LogSoftMax.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/LookupTableBag.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/LookupTable.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/MarginCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/MSECriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/MultiLabelMarginCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/MultiMarginCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/PReLU.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/RReLU.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/Sigmoid.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SmoothL1Criterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SoftMarginCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SoftMax.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SoftPlus.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SoftShrink.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SparseLinear.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialAdaptiveAveragePooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialAdaptiveMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialAveragePooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialClassNLLCriterion.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionLocal.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialCrossMapLRN.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDepthwiseConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFractionalMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialGridSamplerBilinear.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialReflectionPadding.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialReplicationPadding.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialSubSampling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingBilinear.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingNearest.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/Sqrt.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/Square.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/Tanh.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/TemporalConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/TemporalMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/TemporalReflectionPadding.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/TemporalReplicationPadding.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/TemporalRowConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingLinear.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingNearest.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/Threshold.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveAveragePooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAveragePooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFractionalMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricReplicationPadding.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingNearest.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingTrilinear.cu
+  PARENT_SCOPE)  # the extended list is written to the enclosing (parent) scope
 
-OPTION(NDEBUG "disable asserts (WARNING: this may result in silent UB e.g. with out-of-bound indices)")
-IF(NOT NDEBUG)
-  MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS "" ${CMAKE_C_FLAGS})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_DEBUG "" ${CMAKE_C_FLAGS_DEBUG})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "" ${CMAKE_C_FLAGS_RELEASE})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS "" ${CMAKE_CXX_FLAGS})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
-  STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
-ENDIF()
+set(ATen_CUDA_INCLUDE  # append this source directory to the CUDA include list
+  ${ATen_CUDA_INCLUDE} "${CMAKE_CURRENT_SOURCE_DIR}"
+  PARENT_SCOPE)  # the result is written to the enclosing (parent) scope
 
-IF(NOT Torch_FOUND)
-  FIND_PACKAGE(Torch REQUIRED)
-ENDIF()
-
-IF(NOT TH_LIBRARIES)
-  SET(TH_LIBRARIES "TH")
-ENDIF(NOT TH_LIBRARIES)
-MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
-IF(NOT THC_LIBRARIES)
-  SET(THC_LIBRARIES "THC")
-ENDIF(NOT THC_LIBRARIES)
-MESSAGE(STATUS "THC_LIBRARIES: ${THC_LIBRARIES}")
-IF(NOT TH_LINK_STYLE)
-  set(TH_LINK_STYLE SHARED)
-ENDIF()
-
-IF(NOT CUDA_FOUND)
-  FIND_PACKAGE(CUDA 6.5 REQUIRED)
-ENDIF()
-
-IF ($ENV{TH_BINARY_BUILD})
-  MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++")
-  SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
-  IF (UNIX AND NOT APPLE)
-    # hiding statically linked library symbols, this flag is not available for the linker under MACOSX
-    SET(CMAKE_CXX_FLAGS "-Wl,--exclude-libs,libstdc++.a ${CMAKE_CXX_FLAGS}")
-  ENDIF(UNIX AND NOT APPLE)
-ENDIF()
-
-# Detect CUDA architecture and get best NVCC flags
-IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS OR MSVC)
-  INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
-ENDIF()
-LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
-CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
-LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
-IF(CMAKE_POSITION_INDEPENDENT_CODE)
-  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
-ENDIF()
-
-if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
-    if(CUDA_VERSION VERSION_LESS "8.0")
-      MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__")
-    endif(CUDA_VERSION VERSION_LESS "8.0")
-  endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9")
-endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-
-if(CUDA_VERSION VERSION_GREATER "8.0")
-  LIST(APPEND CUDA_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__")
-endif(CUDA_VERSION VERSION_GREATER "8.0")
-
-IF(MSVC)
-  LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
-  ADD_DEFINITIONS(-DTH_EXPORTS)
-ENDIF()
-
-IF(NOT THCUNN_INSTALL_LIB_SUBDIR)
-  SET(THCUNN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THCUNN install binary directory")
-  SET(THCUNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THCUNN install library directory")
-  SET(THCUNN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THCUNN install include subdirectory")
-ENDIF()
-
-FILE(GLOB src-cuda *.cu)
-
-CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-CUDA_ADD_LIBRARY(THCUNN ${TH_LINK_STYLE} ${src-cuda})
-
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-TARGET_LINK_LIBRARIES(THCUNN ${THC_LIBRARIES} ${TH_LIBRARIES} ${CUDA_cusparse_LIBRARY})
-
-# Luarocks bug pre-14.04 prevents us from setting it for Lua-Torch
-IF(THCUNN_SO_VERSION)
-  MESSAGE(STATUS "THCUNN_SO_VERSION: ${THCUNN_SO_VERSION}")
-  SET_TARGET_PROPERTIES(THCUNN PROPERTIES
-    VERSION   ${THCUNN_SO_VERSION}
-    SOVERSION ${THCUNN_SO_VERSION})
-ENDIF(THCUNN_SO_VERSION)
-
-IF(TH_INSTALL_LIBRARIES)
-  INSTALL(TARGETS THCUNN
-            RUNTIME DESTINATION "${THCUNN_INSTALL_BIN_SUBDIR}"
-            LIBRARY DESTINATION "${THCUNN_INSTALL_LIB_SUBDIR}"
-            ARCHIVE DESTINATION "${THCUNN_INSTALL_LIB_SUBDIR}")
-ENDIF()
-INSTALL(FILES THCUNN.h DESTINATION "${THCUNN_INSTALL_INCLUDE_SUBDIR}/THCUNN")
-INSTALL(FILES generic/THCUNN.h DESTINATION "${THCUNN_INSTALL_INCLUDE_SUBDIR}/THCUNN/generic")
+install(FILES THCUNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THCUNN")  # THCUNN header
+install(FILES generic/THCUNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THCUNN/generic")  # generic/ counterpart
diff --git a/aten/src/THCUNN/LookupTableBag.cu b/aten/src/THCUNN/LookupTableBag.cu
index 9cb7212..bf3aa32 100644
--- a/aten/src/THCUNN/LookupTableBag.cu
+++ b/aten/src/THCUNN/LookupTableBag.cu
@@ -53,6 +53,7 @@
 	weightFeatSum = weightFeatSum / ScalarConvert<int64_t, Acctype>::to(bag_size_);
 	bag_size[bag] = bag_size_;
       }
+      (void) MODE_SUM; // silence warnings about unused MODE_SUM
       output[bag * stride + featureDim] = ScalarConvert<Acctype, Dtype>::to(weightFeatSum);
     }
   }
diff --git a/aten/src/THCUNN/generic/LogSoftMax.cu b/aten/src/THCUNN/generic/LogSoftMax.cu
index ab777ea..cef4e5f 100644
--- a/aten/src/THCUNN/generic/LogSoftMax.cu
+++ b/aten/src/THCUNN/generic/LogSoftMax.cu
@@ -19,12 +19,12 @@
   input = THCTensor_(newContiguous)(state, input);
   THCTensor_(resizeAs)(state, output, input);
 
-  uint64_t outer_size = 1;
-  uint64_t dim_size = input->size[dim];
-  uint64_t inner_size = 1;
-  for (uint64_t i = 0; i < dim; ++i)
+  int64_t outer_size = 1;
+  int64_t dim_size = input->size[dim];
+  int64_t inner_size = 1;
+  for (int64_t i = 0; i < dim; ++i)
     outer_size *= input->size[i];
-  for (uint64_t i = dim + 1; i < input->nDimension; ++i)
+  for (int64_t i = dim + 1; i < input->nDimension; ++i)
     inner_size *= input->size[i];
 
   HostSoftMaxForward<real, accreal, LogSoftMaxForwardEpilogue>(
@@ -53,12 +53,12 @@
 
   THCTensor_(resizeAs)(state, gradInput, output);
 
-  uint64_t outer_size = 1;
-  uint64_t dim_size = output->size[dim];
-  uint64_t inner_size = 1;
-  for (uint64_t i = 0; i < dim; ++i)
+  int64_t outer_size = 1;
+  int64_t dim_size = output->size[dim];
+  int64_t inner_size = 1;
+  for (int64_t i = 0; i < dim; ++i)
     outer_size *= output->size[i];
-  for (uint64_t i = dim + 1; i < output->nDimension; ++i)
+  for (int64_t i = dim + 1; i < output->nDimension; ++i)
     inner_size *= output->size[i];
 
   output = THCTensor_(newContiguous)(state, output);
diff --git a/aten/src/THCUNN/generic/SoftMax.cu b/aten/src/THCUNN/generic/SoftMax.cu
index 8bd4f59..4202357 100644
--- a/aten/src/THCUNN/generic/SoftMax.cu
+++ b/aten/src/THCUNN/generic/SoftMax.cu
@@ -19,12 +19,12 @@
   input = THCTensor_(newContiguous)(state, input);
   THCTensor_(resizeAs)(state, output, input);
 
-  uint64_t outer_size = 1;
-  uint64_t dim_size = input->size[dim];
-  uint64_t inner_size = 1;
-  for (uint64_t i = 0; i < dim; ++i)
+  int64_t outer_size = 1;
+  int64_t dim_size = input->size[dim];
+  int64_t inner_size = 1;
+  for (int64_t i = 0; i < dim; ++i)
     outer_size *= input->size[i];
-  for (uint64_t i = dim + 1; i < input->nDimension; ++i)
+  for (int64_t i = dim + 1; i < input->nDimension; ++i)
     inner_size *= input->size[i];
 
   HostSoftMaxForward<real, accreal, SoftMaxForwardEpilogue>(
@@ -53,12 +53,12 @@
 
   THCTensor_(resizeAs)(state, gradInput, output);
 
-  uint64_t outer_size = 1;
-  uint64_t dim_size = output->size[dim];
-  uint64_t inner_size = 1;
-  for (uint64_t i = 0; i < dim; ++i)
+  int64_t outer_size = 1;
+  int64_t dim_size = output->size[dim];
+  int64_t inner_size = 1;
+  for (int64_t i = 0; i < dim; ++i)
     outer_size *= output->size[i];
-  for (uint64_t i = dim + 1; i < output->nDimension; ++i)
+  for (int64_t i = dim + 1; i < output->nDimension; ++i)
     inner_size *= output->size[i];
 
   output = THCTensor_(newContiguous)(state, output);
diff --git a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu
index 6b79c15..61a1c70 100644
--- a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu
+++ b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu
@@ -54,7 +54,7 @@
   real alpha = ScalarConvert<accreal, real>::to(alpha_);
   real beta = ScalarConvert<accreal, real>::to(beta_);
   real k = ScalarConvert<accreal, real>::to(k_);
-
+  (void) k;
   THCTensor_(resizeAs)(state, gradInput, input);
 
   int batchSize;
diff --git a/aten/src/THCUNN/generic/Threshold.cu b/aten/src/THCUNN/generic/Threshold.cu
index 0b7b79e..d174e41 100644
--- a/aten/src/THCUNN/generic/Threshold.cu
+++ b/aten/src/THCUNN/generic/Threshold.cu
@@ -45,6 +45,7 @@
 {
   real threshold = ScalarConvert<accreal, real>::to(threshold_);
   real val = ScalarConvert<accreal, real>::to(val_);
+  (void) val;
   THCUNN_check_nElement(state, input, gradOutput);
   THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
 
diff --git a/aten/src/THNN/CMakeLists.txt b/aten/src/THNN/CMakeLists.txt
index b5d0106..d3ac032 100644
--- a/aten/src/THNN/CMakeLists.txt
+++ b/aten/src/THNN/CMakeLists.txt
@@ -1,95 +1,5 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
-CMAKE_POLICY(VERSION 2.6)
-cmake_policy(SET CMP0015 NEW)
-
-IF(NOT Torch_FOUND)
-  FIND_PACKAGE(Torch REQUIRED)
-ENDIF()
-
-IF(NOT TH_LIBRARIES)
-  SET(TH_LIBRARIES "TH")
-ENDIF(NOT TH_LIBRARIES)
-MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
-IF(NOT TH_LINK_STYLE)
-  set(TH_LINK_STYLE SHARED)
-ENDIF()
-
-IF(NOT THNN_INSTALL_LIB_SUBDIR)
-  SET(THNN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THNN install binary directory")
-  SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory")
-  SET(THNN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THNN install include subdirectory")
-ENDIF()
-
-# Flags
-# When using MSVC
-IF(MSVC)
-  # we want to respect the standard, and we are bored of those **** .
-  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
-  ADD_DEFINITIONS(-DTH_EXPORTS)
-ENDIF(MSVC)
-
-IF (CMAKE_VERSION VERSION_LESS "3.1")
-  SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
-ELSE ()
-  SET(CMAKE_C_STANDARD 99)
-ENDIF ()
-
-# OpenMP support?
-SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
-IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
-  EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
-  STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
-  MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
-  IF (DARWIN_VERSION GREATER 9)
-    SET(APPLE_OPENMP_SUCKS 1)
-  ENDIF (DARWIN_VERSION GREATER 9)
-  EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
-    OUTPUT_VARIABLE GCC_VERSION)
-  IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
-    MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
-    MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
-    SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
-  ENDIF ()
-ENDIF ()
-
-IF (WITH_OPENMP)
-  FIND_PACKAGE(OpenMP)
-  IF(OPENMP_FOUND)
-    MESSAGE(STATUS "Compiling with OpenMP support")
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" PARENT_SCOPE)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" PARENT_SCOPE)
-    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}" PARENT_SCOPE)
-  ENDIF(OPENMP_FOUND)
-ENDIF (WITH_OPENMP)
-
-LINK_DIRECTORIES("${Torch_INSTALL_LIB}")
-
-SET(src init.c)
-ADD_LIBRARY(THNN ${TH_LINK_STYLE} init.c)
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-### Torch packages supposes libraries prefix is "lib"
-IF (NOT MSVC)
-  SET_TARGET_PROPERTIES(THNN PROPERTIES
-    PREFIX "lib"
-    IMPORT_PREFIX "lib")
-ENDIF(NOT MSVC)
-
-TARGET_LINK_LIBRARIES(THNN ${TH_LIBRARIES})
-
-# Luarocks bug pre-14.04 prevents us from setting it for Lua-Torch
-IF(THNN_SO_VERSION)
-  MESSAGE(STATUS "THNN_SO_VERSION: ${THNN_SO_VERSION}")
-  SET_TARGET_PROPERTIES(THNN PROPERTIES
-    VERSION   ${THNN_SO_VERSION}
-    SOVERSION ${THNN_SO_VERSION})
-ENDIF(THNN_SO_VERSION)
-
-IF(TH_INSTALL_LIBRARIES)
-  INSTALL(TARGETS THNN
-    RUNTIME DESTINATION "${THNN_INSTALL_BIN_SUBDIR}"
-    LIBRARY DESTINATION "${THNN_INSTALL_LIB_SUBDIR}"
-    ARCHIVE DESTINATION "${THNN_INSTALL_LIB_SUBDIR}")
-ENDIF()
-INSTALL(FILES THNN.h DESTINATION "${THNN_INSTALL_INCLUDE_SUBDIR}/THNN")
-INSTALL(FILES generic/THNN.h DESTINATION "${THNN_INSTALL_INCLUDE_SUBDIR}/THNN/generic")
+set(ATen_CPU_SRCS  # append THNN's init.c to the CPU source list
+  ${ATen_CPU_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/init.c
+  PARENT_SCOPE)  # the result is written to the enclosing (parent) scope
+install(FILES THNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THNN")  # THNN header
+install(FILES generic/THNN.h DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THNN/generic")  # generic/ counterpart
diff --git a/aten/src/THS/CMakeLists.txt b/aten/src/THS/CMakeLists.txt
index 2487654..492012d 100644
--- a/aten/src/THS/CMakeLists.txt
+++ b/aten/src/THS/CMakeLists.txt
@@ -1,79 +1,10 @@
-CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
-CMAKE_POLICY(VERSION 2.6)
+set(ATen_CPU_SRCS  # append THSTensor.c to the CPU source list
+  ${ATen_CPU_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/THSTensor.c
+  PARENT_SCOPE)  # the result is written to the enclosing (parent) scope
 
-IF(NOT Torch_FOUND)
-  FIND_PACKAGE(Torch REQUIRED)
-ENDIF()
-
-IF(NOT TH_LIBRARIES)
-  SET(TH_LIBRARIES "TH")
-ENDIF(NOT TH_LIBRARIES)
-MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
-IF(NOT TH_LINK_STYLE)
-  set(TH_LINK_STYLE SHARED)
-ENDIF()
-
-IF(NOT THS_INSTALL_BIN_SUBDIR
-    OR NOT THS_INSTALL_LIB_SUBDIR
-    OR NOT THS_INSTALL_INCLUDE_SUBDIR
-    OR NOT THS_INSTALL_CMAKE_SUBDIR)
-
-  INCLUDE_DIRECTORIES(${TH_INCLUDE_PATH} ${TH_INCLUDE_PATH}/TH)
-  LINK_DIRECTORIES(${TH_LIB_PATH})
-
-  SET(THS_INSTALL_BIN_SUBDIR "bin" CACHE PATH "THS install binary subdirectory")
-  SET(THS_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THS install library subdirectory")
-  SET(THS_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THS install include subdirectory")
-  SET(THS_INSTALL_CMAKE_SUBDIR "share/cmake/THS" CACHE PATH "THS install cmake subdirectory")
-ELSE()
-  SET(THS_INSTALL_BIN_SUBDIR ${Torch_INSTALL_BIN_SUBDIR})
-  SET(THS_INSTALL_LIB_SUBDIR ${Torch_INSTALL_LIB_SUBDIR})
-  SET(THS_INSTALL_INCLUDE_SUBDIR ${Torch_INSTALL_INCLUDE_SUBDIR})
-  SET(THS_INSTALL_CMAKE_SUBDIR ${Torch_INSTALL_CMAKE_SUBDIR})
-ENDIF()
-
-# Flags
-# When using MSVC
-IF(MSVC)
-  # we want to respect the standard, and we are bored of those **** .
-  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
-ENDIF(MSVC)
-
-IF (CMAKE_VERSION VERSION_LESS "3.1")
-  SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
-ELSE ()
-  SET(CMAKE_C_STANDARD 99)
-ENDIF ()
-
-SET(hdr
-  THS.h
-  THSTensor.h
-  )
-
-SET(src
-  THSTensor.c
-  )
-
-SET(src ${src} ${hdr})
-ADD_LIBRARY(THS ${TH_LINK_STYLE} ${src})
-
-INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
-TARGET_LINK_LIBRARIES(THS ${TH_LIBRARIES})
-
-IF(NOT THS_SO_VERSION)
-  SET(THS_SO_VERSION 1)
-ENDIF(NOT THS_SO_VERSION)
-MESSAGE(STATUS "THS_SO_VERSION: ${THS_SO_VERSION}")
-SET_TARGET_PROPERTIES(THS PROPERTIES
-  VERSION   ${THS_SO_VERSION}
-  SOVERSION ${THS_SO_VERSION})
-
-IF(TH_INSTALL_LIBRARIES)
-  INSTALL(TARGETS THS
-    RUNTIME DESTINATION "${THS_INSTALL_BIN_SUBDIR}"
-    LIBRARY DESTINATION "${THS_INSTALL_LIB_SUBDIR}"
-    ARCHIVE DESTINATION "${THS_INSTALL_LIB_SUBDIR}")
-ENDIF()
+set(ATen_CPU_INCLUDE  # build (binary) dir, same path the removed INCLUDE_DIRECTORIES call used
+  ${ATen_CPU_INCLUDE} "${CMAKE_CURRENT_BINARY_DIR}"
+  PARENT_SCOPE)  # the result is written to the enclosing (parent) scope
 
 INSTALL(FILES
   THS.h
@@ -81,11 +12,11 @@
   THSGenerateFloatTypes.h
   THSGenerateIntTypes.h
   THSTensor.h
-  DESTINATION "${THS_INSTALL_INCLUDE_SUBDIR}/THS")
+  DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THS")
 
 INSTALL(FILES
   generic/THSTensor.c
   generic/THSTensor.h
   generic/THSTensorMath.c
   generic/THSTensorMath.h
-  DESTINATION "${THS_INSTALL_INCLUDE_SUBDIR}/THS/generic")
+  DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/THS/generic")