Upgrade NVTX to NVTX3 (#90689)

Following the recent upgrade to CUDA 11, we can also upgrade NVTX to NVTX3, a header-only library, which simplifies the build system considerably.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90689
Approved by: https://github.com/soumith, https://github.com/malfet
diff --git a/.gitmodules b/.gitmodules
index 282746e..4da46e5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -154,3 +154,6 @@
 [submodule "third_party/cutlass"]
 	path = third_party/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
+[submodule "third_party/NVTX"]
+	path = third_party/NVTX
+	url = https://github.com/NVIDIA/NVTX.git
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 4e5993f..25e3424 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1475,7 +1475,8 @@
     target_link_libraries(torch_cpu PRIVATE torch::cudart)
   endif()
   target_link_libraries(torch_cuda INTERFACE torch::cudart)
-  target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext)
+  target_link_libraries(torch_cuda PUBLIC c10_cuda)
+  target_link_libraries(torch_cuda PRIVATE torch::nvtoolsext)
 
   target_include_directories(
       torch_cuda INTERFACE $<INSTALL_INTERFACE:include>)
@@ -1530,7 +1531,7 @@
   # not find them, because they're usually in non-standard locations)
   if(USE_CUDA)
     target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
-    target_link_libraries(torch_global_deps torch::cudart torch::nvtoolsext)
+    target_link_libraries(torch_global_deps torch::cudart)
   endif()
   if(USE_TBB)
     target_link_libraries(torch_global_deps TBB::tbb)
diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py
index 888d286..283f08c 100644
--- a/caffe2/python/__init__.py
+++ b/caffe2/python/__init__.py
@@ -26,13 +26,6 @@
     th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch')
     th_dll_path = os.path.join(th_root, 'lib')
 
-    if not os.path.exists(os.path.join(th_dll_path, 'nvToolsExt64_1.dll')) and \
-            not os.path.exists(os.path.join(py_dll_path, 'nvToolsExt64_1.dll')):
-        nvtoolsext_dll_path = os.path.join(
-            os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt'), 'bin', 'x64')
-    else:
-        nvtoolsext_dll_path = ''
-
     import importlib.util
     import glob
     spec = importlib.util.spec_from_file_location('torch_version', os.path.join(th_root, 'version.py'))
@@ -50,7 +43,7 @@
 
     import ctypes
     kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
-    dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path]))
+    dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, cuda_path]))
     with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
     prev_error_mode = kernel32.SetErrorMode(0x0001)
 
diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in
index 6fecc86..95fe448 100644
--- a/cmake/TorchConfig.cmake.in
+++ b/cmake/TorchConfig.cmake.in
@@ -129,30 +129,18 @@
 
 if(@USE_CUDA@)
   if(MSVC)
-    if(NOT NVTOOLEXT_HOME)
-      set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt")
-    endif()
-    if(DEFINED ENV{NVTOOLSEXT_PATH})
-      set(NVTOOLEXT_HOME $ENV{NVTOOLSEXT_PATH})
-    endif()
-    set(TORCH_CUDA_LIBRARIES
-      ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib
-      ${CUDA_LIBRARIES})
-    list(APPEND TORCH_INCLUDE_DIRS ${NVTOOLEXT_HOME}/include)
+    set(TORCH_CUDA_LIBRARIES ${CUDA_LIBRARIES})
     find_library(CAFFE2_NVRTC_LIBRARY caffe2_nvrtc PATHS "${TORCH_INSTALL_PREFIX}/lib")
     list(APPEND TORCH_CUDA_LIBRARIES ${CAFFE2_NVRTC_LIBRARY})
   elseif(APPLE)
     set(TORCH_CUDA_LIBRARIES
       ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib
       ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib
-      ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib
       ${CUDA_LIBRARIES})
   else()
-    find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/)
     set(TORCH_CUDA_LIBRARIES
       ${CUDA_CUDA_LIB}
       ${CUDA_NVRTC_LIB}
-      ${LIBNVTOOLSEXT}
       ${CUDA_LIBRARIES})
   endif()
   if(@BUILD_SHARED_LIBS@)
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index 68de16b..416473f 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -65,10 +65,6 @@
                       "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIR}'")
 endif()
 
-if(NOT TARGET CUDA::nvToolsExt)
-  message(FATAL_ERROR "Failed to find nvToolsExt")
-endif()
-
 message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION})
 message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE})
 message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR})
@@ -214,9 +210,19 @@
 
 # nvToolsExt
 add_library(torch::nvtoolsext INTERFACE IMPORTED)
-set_property(
-    TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES
-    CUDA::nvToolsExt)
+# NVTX3 is header-only: search the CUDA include dirs, else use the submodule.
+find_path(
+  nvtx3_dir
+  NAMES nvtx3
+  PATHS ${CUDA_INCLUDE_DIRS}
+  NO_DEFAULT_PATH)
+find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir)
+if(NOT nvtx3_FOUND)
+  set(nvtx3_dir "${CMAKE_CURRENT_LIST_DIR}/../../third_party/NVTX/c/include")
+  message(WARNING "use NVTX library in ${nvtx3_dir}")
+  include_directories(SYSTEM "${nvtx3_dir}")
+endif()
+target_include_directories(torch::nvtoolsext INTERFACE "${nvtx3_dir}")
 
 # cublas
 add_library(caffe2::cublas INTERFACE IMPORTED)
diff --git a/setup.py b/setup.py
index 9a258ca..566e1e9 100644
--- a/setup.py
+++ b/setup.py
@@ -178,9 +178,6 @@
 #   NVFUSER_SOURCE_DIR
 #     specify nvfuser root directory
 #
-#   NVTOOLSEXT_PATH (Windows only)
-#     specify where nvtoolsext is installed
-#
 #   ACL_ROOT_DIR
 #     specify where Compute Library is installed
 #
diff --git a/third_party/NVTX b/third_party/NVTX
new file mode 160000
index 0000000..e170594
--- /dev/null
+++ b/third_party/NVTX
@@ -0,0 +1 @@
+Subproject commit e170594ac7cf1dac584da473d4ca9301087090c1
diff --git a/third_party/nvfuser/csrc/instrumentation.h b/third_party/nvfuser/csrc/instrumentation.h
index cd57825..5b27f0d 100644
--- a/third_party/nvfuser/csrc/instrumentation.h
+++ b/third_party/nvfuser/csrc/instrumentation.h
@@ -2,7 +2,11 @@
 
 #include <utils.h>
 
+#ifndef FBCODE_CAFFE2
+#include <nvtx3/nvToolsExt.h>
+#else
 #include <nvToolsExt.h>
+#endif
 
 // NOLINTNEXTLINE(modernize-deprecated-headers)
 #include <stdio.h>
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index c6c23f2..d3b2a20 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -129,8 +129,6 @@
         list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::cudnn)
         list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN)
     endif()
-
-    list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext)
 endif()
 
 if(USE_ROCM)
diff --git a/torch/__init__.py b/torch/__init__.py
index e349c3a..5189c05 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -72,12 +72,6 @@
 
     dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, base_py_dll_path]))
 
-    if all([not os.path.exists(os.path.join(p, 'nvToolsExt64_1.dll')) for p in dll_paths]):
-        nvtoolsext_dll_path = os.path.join(
-            os.getenv('NVTOOLSEXT_PATH', os.path.join(pfiles_path, 'NVIDIA Corporation', 'NvToolsExt')), 'bin', 'x64')
-    else:
-        nvtoolsext_dll_path = ''
-
     from .version import cuda as cuda_version
     import glob
     if cuda_version and all([not glob.glob(os.path.join(p, 'cudart64*.dll')) for p in dll_paths]):
@@ -88,7 +82,7 @@
     else:
         cuda_path = ''
 
-    dll_paths.extend(filter(os.path.exists, [nvtoolsext_dll_path, cuda_path]))
+    dll_paths.extend(filter(os.path.exists, [cuda_path]))
 
     kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True)
     with_load_library_flags = hasattr(kernel32, 'AddDllDirectory')
diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp
index 882324c..c4dffbb 100644
--- a/torch/csrc/cuda/shared/nvtx.cpp
+++ b/torch/csrc/cuda/shared/nvtx.cpp
@@ -1,7 +1,11 @@
 #ifdef _WIN32
 #include <wchar.h> // _wgetenv for nvtx
 #endif
+#ifndef FBCODE_CAFFE2
+#include <nvtx3/nvToolsExt.h>
+#else
 #include <nvToolsExt.h>
+#endif
 #include <torch/csrc/utils/pybind.h>
 
 namespace torch {
diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp
index 6731d0f..c9c6c66 100644
--- a/torch/csrc/profiler/stubs/cuda.cpp
+++ b/torch/csrc/profiler/stubs/cuda.cpp
@@ -1,6 +1,10 @@
 #include <sstream>
 
+#ifndef FBCODE_CAFFE2
+#include <nvtx3/nvToolsExt.h>
+#else
 #include <nvToolsExt.h>
+#endif
 
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/irange.h>
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 45e3cb6..f89b812 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -635,7 +635,7 @@
         ("cub/device/device_radix_sort.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
         ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
         ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
-        ("nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
+        ("nvtx3/nvToolsExt.h", ("roctracer/roctx.h", CONV_INCLUDE, API_ROCTX)),
     ]
 )