Merge fd6579e35 for LLVM update to 344140

Change-Id: I8ee9da9db85c45170dc51297c6268fe81af6ad1d
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86ca2b3..12bb52a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,9 @@
       -D${COMPILER_RT_ASAN_SHADOW_SCALE_DEFINITION})
 endif()
 
+set(COMPILER_RT_HWASAN_WITH_INTERCEPTORS ON CACHE BOOLEAN
+    "Enable libc interceptors in HWASan (testing mode)")
+
 set(COMPILER_RT_BAREMETAL_BUILD OFF CACHE BOOLEAN
   "Build for a bare-metal target.")
 
@@ -79,6 +82,61 @@
       or specify the PYTHON_EXECUTABLE CMake variable.")
   endif()
 
+  # Ensure that fat libraries are built correctly on Darwin
+  if(CMAKE_SYSTEM_NAME STREQUAL Darwin)
+    if(NOT CMAKE_LIBTOOL)
+      find_program(CMAKE_XCRUN
+                   NAMES
+                     xcrun)
+      if(CMAKE_XCRUN)
+        execute_process(COMMAND
+                          ${CMAKE_XCRUN} -find libtool
+                        OUTPUT_VARIABLE
+                          CMAKE_LIBTOOL
+                        OUTPUT_STRIP_TRAILING_WHITESPACE)
+      endif()
+
+      if(NOT CMAKE_LIBTOOL OR NOT EXISTS CMAKE_LIBTOOL)
+        find_program(CMAKE_LIBTOOL
+                     NAMES
+                       libtool)
+      endif()
+    endif()
+
+    get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES)
+
+    if(CMAKE_LIBTOOL)
+      set(CMAKE_LIBTOOL ${CMAKE_LIBTOOL} CACHE PATH "libtool executable")
+      message(STATUS "Found libtool - ${CMAKE_LIBTOOL}")
+
+      execute_process(COMMAND
+                        ${CMAKE_LIBTOOL} -V
+                      OUTPUT_VARIABLE
+                        LIBTOOL_V_OUTPUT
+                      OUTPUT_STRIP_TRAILING_WHITESPACE)
+      if("${LIBTOOL_V_OUTPUT}" MATCHES ".*cctools-([0-9]+).*")
+        string(REGEX REPLACE ".*cctools-([0-9]+).*" "\\1" LIBTOOL_VERSION ${LIBTOOL_V_OUTPUT})
+        if(NOT LIBTOOL_VERSION VERSION_LESS "862")
+          set(LIBTOOL_NO_WARNING_FLAG "-no_warning_for_no_symbols")
+        endif()
+      endif()
+
+      foreach(lang ${languages})
+        set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "\"${CMAKE_LIBTOOL}\" -static ${LIBTOOL_NO_WARNING_FLAG} -o <TARGET> <LINK_FLAGS> <OBJECTS>")
+      endforeach()
+    endif()
+
+    # Workaround SIP :-(
+    if(DYLD_LIBRARY_PATH)
+      set(dyld_envar "DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}")
+      foreach(lang ${languages})
+        foreach(cmd ${CMAKE_${lang}_CREATE_STATIC_LIBRARY})
+          list(APPEND CMAKE_${lang}_CREATE_STATIC_LIBRARY_NEW "${dyld_envar} ${cmd}")
+        endforeach()
+      endforeach()
+    endif()
+  endif()
+
   # Define default arguments to lit.
   set(LIT_ARGS_DEFAULT "-sv")
   if (MSVC OR XCODE)
@@ -89,12 +147,12 @@
 endif()
 
 construct_compiler_rt_default_triple()
-if ("${COMPILER_RT_DEFAULT_TARGET_ABI}" MATCHES "hf$")
+if ("${COMPILER_RT_DEFAULT_TARGET_TRIPLE}" MATCHES ".*hf$")
   if (${COMPILER_RT_DEFAULT_TARGET_ARCH} MATCHES "^arm")
     set(COMPILER_RT_DEFAULT_TARGET_ARCH "armhf")
   endif()
 endif()
-if ("${COMPILER_RT_DEFAULT_TARGET_ABI}" MATCHES "^android")
+if ("${COMPILER_RT_DEFAULT_TARGET_TRIPLE}" MATCHES ".*android.*")
   set(ANDROID 1)
 endif()
 pythonize_bool(ANDROID)
@@ -310,6 +368,8 @@
 append_list_if(COMPILER_RT_HAS_WD4722_FLAG /wd4722 SANITIZER_COMMON_CFLAGS)
 append_list_if(COMPILER_RT_HAS_WD4800_FLAG /wd4800 SANITIZER_COMMON_CFLAGS)
 
+append_list_if(MINGW -fms-extensions SANITIZER_COMMON_CFLAGS)
+
 # Set common link flags.
 append_list_if(COMPILER_RT_HAS_NODEFAULTLIBS_FLAG -nodefaultlibs SANITIZER_COMMON_LINK_FLAGS)
 
@@ -325,14 +385,6 @@
 
 append_list_if(COMPILER_RT_HAS_LIBC c SANITIZER_COMMON_LINK_LIBS)
 
-if(ANDROID)
-# Put the Sanitizer shared libraries in the global group. For more details, see
-# android-changes-for-ndk-developers.md#changes-to-library-search-order
-  if (COMPILER_RT_HAS_Z_GLOBAL)
-    list(APPEND SANITIZER_COMMON_LINK_FLAGS -Wl,-z,global)
-  endif()
-endif()
-
 if("${CMAKE_SYSTEM_NAME}" STREQUAL "Fuchsia")
   list(APPEND SANITIZER_COMMON_LINK_FLAGS -Wl,-z,defs,-z,now,-z,relro)
   list(APPEND SANITIZER_COMMON_LINK_LIBS zircon)
diff --git a/cmake/Modules/AddCompilerRT.cmake b/cmake/Modules/AddCompilerRT.cmake
index cd4c704..4efbd3a 100644
--- a/cmake/Modules/AddCompilerRT.cmake
+++ b/cmake/Modules/AddCompilerRT.cmake
@@ -357,6 +357,16 @@
   -I${COMPILER_RT_GTEST_PATH}
 )
 
+# Mocking support.
+set(COMPILER_RT_GMOCK_PATH ${LLVM_MAIN_SRC_DIR}/utils/unittest/googlemock)
+set(COMPILER_RT_GMOCK_SOURCE ${COMPILER_RT_GMOCK_PATH}/src/gmock-all.cc)
+set(COMPILER_RT_GMOCK_CFLAGS
+  -DGTEST_NO_LLVM_RAW_OSTREAM=1
+  -DGTEST_HAS_RTTI=0
+  -I${COMPILER_RT_GMOCK_PATH}/include
+  -I${COMPILER_RT_GMOCK_PATH}
+)
+
 append_list_if(COMPILER_RT_DEBUG -DSANITIZER_DEBUG=1 COMPILER_RT_UNITTEST_CFLAGS)
 append_list_if(COMPILER_RT_HAS_WCOVERED_SWITCH_DEFAULT_FLAG -Wno-covered-switch-default COMPILER_RT_UNITTEST_CFLAGS)
 
diff --git a/cmake/Modules/CompilerRTUtils.cmake b/cmake/Modules/CompilerRTUtils.cmake
index c3f2237..d2cb486 100644
--- a/cmake/Modules/CompilerRTUtils.cmake
+++ b/cmake/Modules/CompilerRTUtils.cmake
@@ -217,7 +217,7 @@
   endif()
   if (LLVM_CONFIG_PATH)
     execute_process(
-      COMMAND ${LLVM_CONFIG_PATH} "--obj-root" "--bindir" "--libdir" "--src-root"
+      COMMAND ${LLVM_CONFIG_PATH} "--obj-root" "--bindir" "--libdir" "--src-root" "--includedir"
       RESULT_VARIABLE HAD_ERROR
       OUTPUT_VARIABLE CONFIG_OUTPUT)
     if (HAD_ERROR)
@@ -228,11 +228,31 @@
     list(GET CONFIG_OUTPUT 1 TOOLS_BINARY_DIR)
     list(GET CONFIG_OUTPUT 2 LIBRARY_DIR)
     list(GET CONFIG_OUTPUT 3 MAIN_SRC_DIR)
+    list(GET CONFIG_OUTPUT 4 INCLUDE_DIR)
 
     set(LLVM_BINARY_DIR ${BINARY_DIR} CACHE PATH "Path to LLVM build tree")
-    set(LLVM_TOOLS_BINARY_DIR ${TOOLS_BINARY_DIR} CACHE PATH "Path to llvm/bin")
     set(LLVM_LIBRARY_DIR ${LIBRARY_DIR} CACHE PATH "Path to llvm/lib")
     set(LLVM_MAIN_SRC_DIR ${MAIN_SRC_DIR} CACHE PATH "Path to LLVM source tree")
+    set(LLVM_TOOLS_BINARY_DIR ${TOOLS_BINARY_DIR} CACHE PATH "Path to llvm/bin")
+    set(LLVM_INCLUDE_DIR ${INCLUDE_DIR} CACHE PATH "Paths to LLVM headers")
+
+    # Detect if we have the LLVMXRay and TestingSupport library installed and
+    # available from llvm-config.
+    execute_process(
+      COMMAND ${LLVM_CONFIG_PATH} "--ldflags" "--libs" "xray" "testingsupport"
+      RESULT_VARIABLE HAD_ERROR
+      OUTPUT_VARIABLE CONFIG_OUTPUT)
+    if (HAD_ERROR)
+      message(WARNING "llvm-config finding xray failed with status ${HAD_ERROR}")
+      set(COMPILER_RT_HAS_LLVMXRAY FALSE)
+    else()
+      string(REGEX REPLACE "[ \t]*[\r\n]+[ \t]*" ";" CONFIG_OUTPUT ${CONFIG_OUTPUT})
+      list(GET CONFIG_OUTPUT 0 LDFLAGS)
+      list(GET CONFIG_OUTPUT 1 LIBLIST)
+      set(LLVM_XRAY_LDFLAGS ${LDFLAGS} CACHE STRING "Linker flags for LLVMXRay library")
+      set(LLVM_XRAY_LIBLIST ${LIBLIST} CACHE STRING "Library list for LLVMXRay")
+      set(COMPILER_RT_HAS_LLVMXRAY TRUE)
+    endif()
 
     # Make use of LLVM CMake modules.
     # --cmakedir is supported since llvm r291218 (4.0 release)
@@ -276,11 +296,6 @@
 
   string(REPLACE "-" ";" TARGET_TRIPLE_LIST ${COMPILER_RT_DEFAULT_TARGET_TRIPLE})
   list(GET TARGET_TRIPLE_LIST 0 COMPILER_RT_DEFAULT_TARGET_ARCH)
-  list(GET TARGET_TRIPLE_LIST 1 COMPILER_RT_DEFAULT_TARGET_OS)
-  list(LENGTH TARGET_TRIPLE_LIST TARGET_TRIPLE_LIST_LENGTH)
-  if(TARGET_TRIPLE_LIST_LENGTH GREATER 2)
-    list(GET TARGET_TRIPLE_LIST 2 COMPILER_RT_DEFAULT_TARGET_ABI)
-  endif()
   # Determine if test target triple is specified explicitly, and doesn't match the
   # default.
   if(NOT COMPILER_RT_DEFAULT_TARGET_TRIPLE STREQUAL TARGET_TRIPLE)
@@ -320,13 +335,12 @@
 endfunction()
 
 function(get_compiler_rt_target arch variable)
+  string(FIND ${COMPILER_RT_DEFAULT_TARGET_TRIPLE} "-" dash_index)
+  string(SUBSTRING ${COMPILER_RT_DEFAULT_TARGET_TRIPLE} ${dash_index} -1 triple_suffix)
   if(ANDROID AND ${arch} STREQUAL "i386")
-    set(target "i686${COMPILER_RT_OS_SUFFIX}-${COMPILER_RT_DEFAULT_TARGET_OS}")
+    set(target "i686${COMPILER_RT_OS_SUFFIX}${triple_suffix}")
   else()
-    set(target "${arch}-${COMPILER_RT_DEFAULT_TARGET_OS}")
-  endif()
-  if(COMPILER_RT_DEFAULT_TARGET_ABI)
-    set(target "${target}-${COMPILER_RT_DEFAULT_TARGET_ABI}")
+    set(target "${arch}${triple_suffix}")
   endif()
   set(${variable} ${target} PARENT_SCOPE)
 endfunction()
diff --git a/cmake/Modules/SanitizerUtils.cmake b/cmake/Modules/SanitizerUtils.cmake
index b631242..8fe4baa 100644
--- a/cmake/Modules/SanitizerUtils.cmake
+++ b/cmake/Modules/SanitizerUtils.cmake
@@ -51,7 +51,15 @@
 # This function is only used on Darwin, where undefined symbols must be specified
 # in the linker invocation.
 function(add_weak_symbols libname link_flags)
-  file(STRINGS "${COMPILER_RT_SOURCE_DIR}/lib/${libname}/weak_symbols.txt" WEAK_SYMBOLS)
+  set(weak_symbols_file "${COMPILER_RT_SOURCE_DIR}/lib/${libname}/weak_symbols.txt")
+  file(STRINGS  "${weak_symbols_file}" WEAK_SYMBOLS)
+  # Add this file as a configure-time dependency so that changes to this
+  # file trigger a re-configure. This is necessary so that `${link_flags}`
+  # is changed when appropriate.
+  set_property(
+    DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    APPEND
+    PROPERTY CMAKE_CONFIGURE_DEPENDS "${weak_symbols_file}")
   set(local_link_flags ${${link_flags}})
   foreach(SYMBOL ${WEAK_SYMBOLS})
     set(local_link_flags ${local_link_flags} -Wl,-U,${SYMBOL})
diff --git a/cmake/base-config-ix.cmake b/cmake/base-config-ix.cmake
index 91fe249..b92e29a 100644
--- a/cmake/base-config-ix.cmake
+++ b/cmake/base-config-ix.cmake
@@ -191,11 +191,11 @@
       # clang's default CPU's. In the 64-bit case, we must also specify the ABI
       # since the default ABI differs between gcc and clang.
       # FIXME: Ideally, we would build the N32 library too.
-      test_target_arch(mipsel "" "-mips32r2" "--target=mipsel-linux-gnu")
-      test_target_arch(mips64el "" "-mips64r2" "--target=mips64el-linux-gnu" "-mabi=64")
+      test_target_arch(mipsel "" "-mips32r2" "-mabi=32")
+      test_target_arch(mips64el "" "-mips64r2" "-mabi=64")
     elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "mips")
-      test_target_arch(mips "" "-mips32r2" "--target=mips-linux-gnu")
-      test_target_arch(mips64 "" "-mips64r2" "--target=mips64-linux-gnu" "-mabi=64")
+      test_target_arch(mips "" "-mips32r2" "-mabi=32")
+      test_target_arch(mips64 "" "-mips64r2" "-mabi=64")
     elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "arm")
       if(WIN32)
         test_target_arch(arm "" "" "")
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index f3935ff..27097b7 100644
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -37,6 +37,19 @@
   elseif (COMPILER_RT_HAS_GCC_LIB)
     list(APPEND CMAKE_REQUIRED_LIBRARIES gcc)
   endif ()
+  if (MINGW)
+    # Mingw64 requires quite a few "C" runtime libraries in order for basic
+    # programs to link successfully with -nodefaultlibs.
+    if (COMPILER_RT_USE_BUILTINS_LIBRARY)
+      set(MINGW_RUNTIME ${COMPILER_RT_BUILTINS_LIBRARY})
+    else ()
+      set(MINGW_RUNTIME gcc_s gcc)
+    endif()
+    set(MINGW_LIBRARIES mingw32 ${MINGW_RUNTIME} moldname mingwex msvcrt advapi32
+                        shell32 user32 kernel32 mingw32 ${MINGW_RUNTIME}
+                        moldname mingwex msvcrt)
+    list(APPEND CMAKE_REQUIRED_LIBRARIES ${MINGW_LIBRARIES})
+  endif()
 endif ()
 
 # CodeGen options.
@@ -105,6 +118,19 @@
 check_library_exists(rt shm_open "" COMPILER_RT_HAS_LIBRT)
 check_library_exists(m pow "" COMPILER_RT_HAS_LIBM)
 check_library_exists(pthread pthread_create "" COMPILER_RT_HAS_LIBPTHREAD)
+
+# Look for terminfo library, used in unittests that depend on LLVMSupport.
+if(LLVM_ENABLE_TERMINFO)
+  foreach(library tinfo terminfo curses ncurses ncursesw)
+    check_library_exists(
+      ${library} setupterm "" COMPILER_RT_HAS_TERMINFO)
+    if(COMPILER_RT_HAS_TERMINFO)
+      set(COMPILER_RT_TERMINFO_LIB "${library}")
+      break()
+    endif()
+  endforeach()
+endif()
+
 if (ANDROID AND COMPILER_RT_HAS_LIBDL)
   # Android's libstdc++ has a dependency on libdl.
   list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
@@ -501,7 +527,8 @@
 
 if (SANITIZER_COMMON_SUPPORTED_ARCH AND NOT LLVM_USE_SANITIZER AND
     (OS_NAME MATCHES "Android|Darwin|Linux|FreeBSD|NetBSD|OpenBSD|Fuchsia|SunOS" OR
-    (OS_NAME MATCHES "Windows" AND (NOT MINGW AND NOT CYGWIN))))
+    (OS_NAME MATCHES "Windows" AND NOT CYGWIN AND
+        (NOT MINGW OR CMAKE_CXX_COMPILER_ID MATCHES "Clang"))))
   set(COMPILER_RT_HAS_SANITIZER_COMMON TRUE)
 else()
   set(COMPILER_RT_HAS_SANITIZER_COMMON FALSE)
@@ -598,7 +625,7 @@
 endif()
 
 if (COMPILER_RT_HAS_SANITIZER_COMMON AND ESAN_SUPPORTED_ARCH AND
-    OS_NAME MATCHES "Linux")
+    OS_NAME MATCHES "Linux|FreeBSD")
   set(COMPILER_RT_HAS_ESAN TRUE)
 else()
   set(COMPILER_RT_HAS_ESAN FALSE)
@@ -619,7 +646,10 @@
 endif()
 
 if (COMPILER_RT_HAS_SANITIZER_COMMON AND FUZZER_SUPPORTED_ARCH AND
-    OS_NAME MATCHES "Android|Darwin|Linux|NetBSD|FreeBSD|OpenBSD|Fuchsia")
+    OS_NAME MATCHES "Android|Darwin|Linux|NetBSD|FreeBSD|OpenBSD|Fuchsia|Windows" AND
+    # TODO: Support builds with MSVC.
+    NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND
+    NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
   set(COMPILER_RT_HAS_FUZZER TRUE)
 else()
   set(COMPILER_RT_HAS_FUZZER FALSE)
diff --git a/include/sanitizer/allocator_interface.h b/include/sanitizer/allocator_interface.h
index 89f3283..e44c4a1 100644
--- a/include/sanitizer/allocator_interface.h
+++ b/include/sanitizer/allocator_interface.h
@@ -82,7 +82,6 @@
      Currently available with ASan only.
   */
   void __sanitizer_purge_allocator(void);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/include/sanitizer/common_interface_defs.h b/include/sanitizer/common_interface_defs.h
index d11cb1a..bf015eb 100644
--- a/include/sanitizer/common_interface_defs.h
+++ b/include/sanitizer/common_interface_defs.h
@@ -124,6 +124,12 @@
 
   // Symbolizes the supplied 'pc' using the format string 'fmt'.
   // Outputs at most 'out_buf_size' bytes into 'out_buf'.
+  // If 'out_buf' is not empty then output is zero or more non empty C strings
+  // followed by single empty C string. Multiple strings can be returned if PC
+  // corresponds to inlined function. Inlined frames are printed in the order
+  // from "most-inlined" to the "least-inlined", so the last frame should be the
+  // not inlined function.
+  // Inlined frames can be removed with 'symbolize_inline_frames=0'.
   // The format syntax is described in
   // lib/sanitizer_common/sanitizer_stacktrace_printer.h.
   void __sanitizer_symbolize_pc(void *pc, const char *fmt, char *out_buf,
diff --git a/include/sanitizer/hwasan_interface.h b/include/sanitizer/hwasan_interface.h
index 0c306cf..1affd87 100644
--- a/include/sanitizer/hwasan_interface.h
+++ b/include/sanitizer/hwasan_interface.h
@@ -19,6 +19,12 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
+  // Initialize shadow but not the rest of the runtime.
+  // Does not call libc unless there is an error.
+  // Can be called multiple times, or not at all (in which case shadow will
+  // be initialized in compiler-inserted __hwasan_init() call).
+  void __hwasan_shadow_init(void);
+
   // This function may be optionally provided by user and should return
   // a string containing HWASan runtime options. See asan_flags.h for details.
   const char* __hwasan_default_options(void);
@@ -26,6 +32,51 @@
   void __hwasan_enable_allocator_tagging(void);
   void __hwasan_disable_allocator_tagging(void);
 
+  // Mark region of memory with the given tag. Both address and size need to be
+  // 16-byte aligned.
+  void __hwasan_tag_memory(const volatile void *p, unsigned char tag,
+                           size_t size);
+
+  /// Set pointer tag. Previous tag is lost.
+  void *__hwasan_tag_pointer(const volatile void *p, unsigned char tag);
+
+  // Set memory tag from the current SP address to the given address to zero.
+  // This is meant to annotate longjmp and other non-local jumps.
+  // This function needs to know the (almost) exact destination frame address;
+  // clearing shadow for the entire thread stack like __asan_handle_no_return
+  // does would cause false reports.
+  void __hwasan_handle_longjmp(const void *sp_dst);
+
+  // Libc hook for thread creation. Should be called in the child thread before
+  // any instrumented code.
+  void __hwasan_thread_enter();
+
+  // Libc hook for thread destruction. No instrumented code should run after
+  // this call.
+  void __hwasan_thread_exit();
+
+  // Print shadow and origin for the memory range to stderr in a human-readable
+  // format.
+  void __hwasan_print_shadow(const volatile void *x, size_t size);
+
+  // Print one-line report about the memory usage of the current process.
+  void __hwasan_print_memory_usage();
+
+  int __sanitizer_posix_memalign(void **memptr, size_t alignment, size_t size);
+  void * __sanitizer_memalign(size_t alignment, size_t size);
+  void * __sanitizer_aligned_alloc(size_t alignment, size_t size);
+  void * __sanitizer___libc_memalign(size_t alignment, size_t size);
+  void * __sanitizer_valloc(size_t size);
+  void * __sanitizer_pvalloc(size_t size);
+  void __sanitizer_free(void *ptr);
+  void __sanitizer_cfree(void *ptr);
+  size_t __sanitizer_malloc_usable_size(const void *ptr);
+  struct mallinfo __sanitizer_mallinfo();
+  int __sanitizer_mallopt(int cmd, int value);
+  void __sanitizer_malloc_stats(void);
+  void * __sanitizer_calloc(size_t nmemb, size_t size);
+  void * __sanitizer_realloc(void *ptr, size_t size);
+  void * __sanitizer_malloc(size_t size);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/include/xray/xray_log_interface.h b/include/xray/xray_log_interface.h
index 5f8b3a4..3994678 100644
--- a/include/xray/xray_log_interface.h
+++ b/include/xray/xray_log_interface.h
@@ -158,8 +158,8 @@
   /// The log initialization routine provided by the implementation, always
   /// provided with the following parameters:
   ///
-  ///   - buffer size
-  ///   - maximum number of buffers
+  ///   - buffer size (unused)
+  ///   - maximum number of buffers (unused)
   ///   - a pointer to an argument struct that the implementation MUST handle
   ///   - the size of the argument struct
   ///
@@ -355,25 +355,4 @@
 
 } // extern "C"
 
-namespace __xray {
-
-/// DEPRECATED: Use __xray_log_init_mode(...) instead, and provide flag
-/// configuration strings to set the options instead.
-/// Options used by the LLVM XRay FDR logging implementation.
-struct FDRLoggingOptions {
-  bool ReportErrors = false;
-  int Fd = -1;
-};
-
-/// DEPRECATED: Use __xray_log_init_mode(...) instead, and provide flag
-/// configuration strings to set the options instead.
-/// Options used by the LLVM XRay Basic (Naive) logging implementation.
-struct BasicLoggingOptions {
-  int DurationFilterMicros = 0;
-  size_t MaxStackDepth = 0;
-  size_t ThreadBufferSize = 0;
-};
-
-} // namespace __xray
-
 #endif // XRAY_XRAY_LOG_INTERFACE_H
diff --git a/lib/asan/CMakeLists.txt b/lib/asan/CMakeLists.txt
index 2ae5c85..4afc8c9 100644
--- a/lib/asan/CMakeLists.txt
+++ b/lib/asan/CMakeLists.txt
@@ -75,6 +75,14 @@
 
 set(ASAN_DYNAMIC_LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS})
 
+if(ANDROID)
+# Put most Sanitizer shared libraries in the global group. For more details, see
+# android-changes-for-ndk-developers.md#changes-to-library-search-order
+  if (COMPILER_RT_HAS_Z_GLOBAL)
+    list(APPEND ASAN_DYNAMIC_LINK_FLAGS -Wl,-z,global)
+  endif()
+endif()
+
 set(ASAN_DYNAMIC_DEFINITIONS
   ${ASAN_COMMON_DEFINITIONS} ASAN_DYNAMIC=1)
 append_list_if(WIN32 INTERCEPTION_DYNAMIC_CRT ASAN_DYNAMIC_DEFINITIONS)
@@ -91,6 +99,7 @@
 append_list_if(COMPILER_RT_HAS_LIBM m ASAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBPTHREAD pthread ASAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBLOG log ASAN_DYNAMIC_LIBS)
+append_list_if(MINGW "${MINGW_LIBRARIES}" ASAN_DYNAMIC_LIBS)
 
 # Compile ASan sources into an object library.
 
@@ -138,6 +147,7 @@
   add_weak_symbols("lsan" WEAK_SYMBOL_LINK_FLAGS)
   add_weak_symbols("ubsan" WEAK_SYMBOL_LINK_FLAGS)
   add_weak_symbols("sanitizer_common" WEAK_SYMBOL_LINK_FLAGS)
+  add_weak_symbols("xray" WEAK_SYMBOL_LINK_FLAGS)
 
   add_compiler_rt_runtime(clang_rt.asan
     SHARED
@@ -215,7 +225,7 @@
     endif()
 
     set(ASAN_DYNAMIC_WEAK_INTERCEPTION)
-    if (MSVC)
+    if (WIN32)
       add_compiler_rt_object_libraries(AsanWeakInterception
         ${SANITIZER_COMMON_SUPPORTED_OS}
         ARCHS ${arch}
diff --git a/lib/asan/asan_globals_win.cc b/lib/asan/asan_globals_win.cc
index 29ab5eb..0e75992 100644
--- a/lib/asan/asan_globals_win.cc
+++ b/lib/asan/asan_globals_win.cc
@@ -29,7 +29,7 @@
   __asan_global *end = &__asan_globals_end;
   uptr bytediff = (uptr)end - (uptr)start;
   if (bytediff % sizeof(__asan_global) != 0) {
-#ifdef SANITIZER_DLL_THUNK
+#if defined(SANITIZER_DLL_THUNK) || defined(SANITIZER_DYNAMIC_RUNTIME_THUNK)
     __debugbreak();
 #else
     CHECK("corrupt asan global array");
diff --git a/lib/asan/asan_malloc_win.cc b/lib/asan/asan_malloc_win.cc
index efa0582..a094e05 100644
--- a/lib/asan/asan_malloc_win.cc
+++ b/lib/asan/asan_malloc_win.cc
@@ -14,8 +14,17 @@
 
 #include "sanitizer_common/sanitizer_platform.h"
 #if SANITIZER_WINDOWS
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
+// Intentionally not including windows.h here, to avoid the risk of
+// pulling in conflicting declarations of these functions. (With mingw-w64,
+// there's a risk of windows.h pulling in stdint.h.)
+typedef int BOOL;
+typedef void *HANDLE;
+typedef const void *LPCVOID;
+typedef void *LPVOID;
+
+#define HEAP_ZERO_MEMORY           0x00000008
+#define HEAP_REALLOC_IN_PLACE_ONLY 0x00000010
+
 
 #include "asan_allocator.h"
 #include "asan_interceptors.h"
@@ -125,7 +134,7 @@
 }
 
 ALLOCATION_FUNCTION_ATTRIBUTE
-size_t _msize(const void *ptr) {
+size_t _msize(void *ptr) {
   GET_CURRENT_PC_BP_SP;
   (void)sp;
   return asan_malloc_usable_size(ptr, pc, bp);
diff --git a/lib/asan/asan_new_delete.cc b/lib/asan/asan_new_delete.cc
index 30efd61..e6053c1 100644
--- a/lib/asan/asan_new_delete.cc
+++ b/lib/asan/asan_new_delete.cc
@@ -26,7 +26,7 @@
 // anyway by passing extra -export flags to the linker, which is exactly that
 // dllexport would normally do. We need to export them in order to make the
 // VS2015 dynamic CRT (MD) work.
-#if SANITIZER_WINDOWS
+#if SANITIZER_WINDOWS && defined(_MSC_VER)
 #define CXX_OPERATOR_ATTRIBUTE
 #define COMMENT_EXPORT(sym) __pragma(comment(linker, "/export:" sym))
 #ifdef _WIN64
diff --git a/lib/asan/asan_win.cc b/lib/asan/asan_win.cc
index 67125d3..5661d91 100644
--- a/lib/asan/asan_win.cc
+++ b/lib/asan/asan_win.cc
@@ -159,6 +159,14 @@
 namespace __asan {
 
 void InitializePlatformInterceptors() {
+  // The interceptors were not designed to be removable, so we have to keep this
+  // module alive for the life of the process.
+  HMODULE pinned;
+  CHECK(GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                           GET_MODULE_HANDLE_EX_FLAG_PIN,
+                           (LPCWSTR)&InitializePlatformInterceptors,
+                           &pinned));
+
   ASAN_INTERCEPT_FUNC(CreateThread);
   ASAN_INTERCEPT_FUNC(SetUnhandledExceptionFilter);
 
diff --git a/lib/asan/tests/asan_interface_test.cc b/lib/asan/tests/asan_interface_test.cc
index 69c8fe6..b5c8303 100644
--- a/lib/asan/tests/asan_interface_test.cc
+++ b/lib/asan/tests/asan_interface_test.cc
@@ -102,6 +102,7 @@
   }
 }
 
+#if !defined(__NetBSD__)
 static const size_t kManyThreadsMallocSizes[] = {5, 1UL<<10, 1UL<<14, 357};
 static const size_t kManyThreadsIterations = 250;
 static const size_t kManyThreadsNumThreads =
@@ -135,6 +136,7 @@
   // so we can't check for equality here.
   EXPECT_LT(after_test, before_test + (1UL<<20));
 }
+#endif
 
 static void DoDoubleFree() {
   int *x = Ident(new int);
diff --git a/lib/asan/tests/asan_noinst_test.cc b/lib/asan/tests/asan_noinst_test.cc
index 65acb28..3e36684 100644
--- a/lib/asan/tests/asan_noinst_test.cc
+++ b/lib/asan/tests/asan_noinst_test.cc
@@ -153,6 +153,7 @@
   EXPECT_LT(i, max_i);
 }
 
+#if !defined(__NetBSD__)
 void *ThreadedQuarantineTestWorker(void *unused) {
   (void)unused;
   u32 seed = my_rand();
@@ -187,6 +188,7 @@
     EXPECT_LT(mmaped2 - mmaped1, 320U * (1 << 20));
   }
 }
+#endif
 
 void *ThreadedOneSizeMallocStress(void *unused) {
   (void)unused;
diff --git a/lib/builtins/arm/addsf3.S b/lib/builtins/arm/addsf3.S
index 362b5c1..74723cb 100644
--- a/lib/builtins/arm/addsf3.S
+++ b/lib/builtins/arm/addsf3.S
@@ -178,7 +178,7 @@
 
   push {r0, r1, r2, r3}
   movs r0, r4
-  bl   __clzsi2
+  bl   SYMBOL_NAME(__clzsi2)
   movs r5, r0
   pop {r0, r1, r2, r3}
   // shift = rep_clz(aSignificand) - rep_clz(implicitBit << 3);
diff --git a/lib/builtins/arm/aeabi_cdcmp.S b/lib/builtins/arm/aeabi_cdcmp.S
index 87dd03d..adc2d55 100644
--- a/lib/builtins/arm/aeabi_cdcmp.S
+++ b/lib/builtins/arm/aeabi_cdcmp.S
@@ -55,7 +55,7 @@
         mov ip, #APSR_C
         msr APSR_nzcvq, ip
 #else
-        msr CPSR_f, #APSR_C
+        msr APSR_nzcvq, #APSR_C
 #endif
         JMP(lr)
 #endif
@@ -115,11 +115,7 @@
         movne ip, #(APSR_C)
 
 1:
-#if defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
         msr APSR_nzcvq, ip
-#else
-        msr CPSR_f, ip
-#endif
         pop {r0-r3}
         POP_PC()
 #endif
diff --git a/lib/builtins/arm/aeabi_cfcmp.S b/lib/builtins/arm/aeabi_cfcmp.S
index c5fee6b..4b1de99 100644
--- a/lib/builtins/arm/aeabi_cfcmp.S
+++ b/lib/builtins/arm/aeabi_cfcmp.S
@@ -55,7 +55,7 @@
         mov ip, #APSR_C
         msr APSR_nzcvq, ip
 #else
-        msr CPSR_f, #APSR_C
+        msr APSR_nzcvq, #APSR_C
 #endif
         JMP(lr)
 #endif
@@ -115,11 +115,7 @@
         movne ip, #(APSR_C)
 
 1:
-#if defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
         msr APSR_nzcvq, ip
-#else
-        msr CPSR_f, ip
-#endif
         pop {r0-r3}
         POP_PC()
 #endif
diff --git a/lib/builtins/clzdi2.c b/lib/builtins/clzdi2.c
index b56d98f..1819e6b 100644
--- a/lib/builtins/clzdi2.c
+++ b/lib/builtins/clzdi2.c
@@ -16,8 +16,13 @@
 
 /* Returns: the number of leading 0-bits */
 
-#if !defined(__clang__) && (defined(__sparc64__) || defined(__mips64) || defined(__riscv__))
-/* gcc resolves __builtin_clz -> __clzdi2 leading to infinite recursion */
+#if !defined(__clang__) &&                                                     \
+    ((defined(__sparc__) && defined(__arch64__)) ||                            \
+     defined(__mips64) ||                                                      \
+     (defined(__riscv) && __SIZEOF_POINTER__ >= 8))
+/* On 64-bit architectures with neither a native clz instruction nor a native
+ * ctz instruction, gcc resolves __builtin_clz to __clzdi2 rather than
+ * __clzsi2, leading to infinite recursion. */
 #define __builtin_clz(a) __clzsi2(a)
 extern si_int __clzsi2(si_int);
 #endif
diff --git a/lib/builtins/ctzdi2.c b/lib/builtins/ctzdi2.c
index eecde29..ef6d7fe 100644
--- a/lib/builtins/ctzdi2.c
+++ b/lib/builtins/ctzdi2.c
@@ -16,8 +16,13 @@
 
 /* Returns: the number of trailing 0-bits  */
 
-#if !defined(__clang__) && (defined(__sparc64__) || defined(__mips64) || defined(__riscv__))
-/* gcc resolves __builtin_ctz -> __ctzdi2 leading to infinite recursion */
+#if !defined(__clang__) &&                                                     \
+    ((defined(__sparc__) && defined(__arch64__)) ||                            \
+     defined(__mips64) ||                                                      \
+     (defined(__riscv) && __SIZEOF_POINTER__ >= 8))
+/* On 64-bit architectures with neither a native clz instruction nor a native
+ * ctz instruction, gcc resolves __builtin_ctz to __ctzdi2 rather than
+ * __ctzsi2, leading to infinite recursion. */
 #define __builtin_ctz(a) __ctzsi2(a)
 extern si_int __ctzsi2(si_int);
 #endif
diff --git a/lib/builtins/divdc3.c b/lib/builtins/divdc3.c
index 3c88390..392d6ec 100644
--- a/lib/builtins/divdc3.c
+++ b/lib/builtins/divdc3.c
@@ -12,6 +12,8 @@
  * ===----------------------------------------------------------------------===
  */
 
+#define DOUBLE_PRECISION
+#include "fp_lib.h"
 #include "int_lib.h"
 #include "int_math.h"
 
@@ -21,7 +23,7 @@
 __divdc3(double __a, double __b, double __c, double __d)
 {
     int __ilogbw = 0;
-    double __logbw = crt_logb(crt_fmax(crt_fabs(__c), crt_fabs(__d)));
+    double __logbw = __compiler_rt_logb(crt_fmax(crt_fabs(__c), crt_fabs(__d)));
     if (crt_isfinite(__logbw))
     {
         __ilogbw = (int)__logbw;
diff --git a/lib/builtins/divsc3.c b/lib/builtins/divsc3.c
index 42a4831..0d18a25 100644
--- a/lib/builtins/divsc3.c
+++ b/lib/builtins/divsc3.c
@@ -12,6 +12,8 @@
  *===----------------------------------------------------------------------===
  */
 
+#define SINGLE_PRECISION
+#include "fp_lib.h"
 #include "int_lib.h"
 #include "int_math.h"
 
@@ -21,7 +23,8 @@
 __divsc3(float __a, float __b, float __c, float __d)
 {
     int __ilogbw = 0;
-    float __logbw = crt_logbf(crt_fmaxf(crt_fabsf(__c), crt_fabsf(__d)));
+    float __logbw =
+        __compiler_rt_logbf(crt_fmaxf(crt_fabsf(__c), crt_fabsf(__d)));
     if (crt_isfinite(__logbw))
     {
         __ilogbw = (int)__logbw;
diff --git a/lib/builtins/divtc3.c b/lib/builtins/divtc3.c
index 16e538b..e5ea00d 100644
--- a/lib/builtins/divtc3.c
+++ b/lib/builtins/divtc3.c
@@ -12,6 +12,8 @@
  *===----------------------------------------------------------------------===
  */
 
+#define QUAD_PRECISION
+#include "fp_lib.h"
 #include "int_lib.h"
 #include "int_math.h"
 
@@ -21,7 +23,8 @@
 __divtc3(long double __a, long double __b, long double __c, long double __d)
 {
     int __ilogbw = 0;
-    long double __logbw = crt_logbl(crt_fmaxl(crt_fabsl(__c), crt_fabsl(__d)));
+    long double __logbw =
+        __compiler_rt_logbl(crt_fmaxl(crt_fabsl(__c), crt_fabsl(__d)));
     if (crt_isfinite(__logbw))
     {
         __ilogbw = (int)__logbw;
diff --git a/lib/builtins/emutls.c b/lib/builtins/emutls.c
index 07d436e..ef95a1c 100644
--- a/lib/builtins/emutls.c
+++ b/lib/builtins/emutls.c
@@ -42,6 +42,7 @@
 
 static pthread_mutex_t emutls_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_key_t emutls_pthread_key;
+static bool emutls_key_created = false;
 
 typedef unsigned int gcc_word __attribute__((mode(word)));
 typedef unsigned int gcc_pointer __attribute__((mode(pointer)));
@@ -109,6 +110,7 @@
 static __inline void emutls_init(void) {
     if (pthread_key_create(&emutls_pthread_key, emutls_key_destructor) != 0)
         abort();
+    emutls_key_created = true;
 }
 
 static __inline void emutls_init_once(void) {
@@ -390,3 +392,14 @@
         array->data[index] = emutls_allocate_object(control);
     return array->data[index];
 }
+
+#ifdef __BIONIC__
+/* Called by Bionic on dlclose to delete the emutls pthread key. */
+__attribute__((visibility("hidden")))
+void __emutls_unregister_key(void) {
+    if (emutls_key_created) {
+        pthread_key_delete(emutls_pthread_key);
+        emutls_key_created = false;
+    }
+}
+#endif
diff --git a/lib/builtins/fp_lib.h b/lib/builtins/fp_lib.h
index 223fb98..a0e19ab 100644
--- a/lib/builtins/fp_lib.h
+++ b/lib/builtins/fp_lib.h
@@ -25,6 +25,7 @@
 #include <stdbool.h>
 #include <limits.h>
 #include "int_lib.h"
+#include "int_math.h"
 
 // x86_64 FreeBSD prior v9.3 define fixed-width types incorrectly in
 // 32-bit mode.
@@ -265,6 +266,62 @@
         *hi = 0;
     }
 }
+
+// Implements logb methods (logb, logbf, logbl) for IEEE-754. This avoids
+// pulling in a libm dependency from compiler-rt, but is not meant to replace
+// it (i.e. code calling logb() should get the one from libm, not this), hence
+// the __compiler_rt prefix.
+static __inline fp_t __compiler_rt_logbX(fp_t x) {
+  rep_t rep = toRep(x);
+  int exp = (rep & exponentMask) >> significandBits;
+
+  // Abnormal cases:
+  // 1) +/- inf returns +inf; NaN returns NaN
+  // 2) 0.0 returns -inf
+  if (exp == maxExponent) {
+    if (((rep & signBit) == 0) || (x != x)) {
+      return x;  // NaN or +inf: return x
+    } else {
+      return -x;  // -inf: return -x
+    }
+  } else if (x == 0.0) {
+    // 0.0: return -inf
+    return fromRep(infRep | signBit);
+  }
+
+  if (exp != 0) {
+    // Normal number
+    return exp - exponentBias;  // Unbias exponent
+  } else {
+    // Subnormal number; normalize and repeat
+    rep &= absMask;
+    const int shift = 1 - normalize(&rep);
+    exp = (rep & exponentMask) >> significandBits;
+    return exp - exponentBias - shift;  // Unbias exponent
+  }
+}
+#endif
+
+#if defined(SINGLE_PRECISION)
+static __inline fp_t __compiler_rt_logbf(fp_t x) {
+  return __compiler_rt_logbX(x);
+}
+#elif defined(DOUBLE_PRECISION)
+static __inline fp_t __compiler_rt_logb(fp_t x) {
+  return __compiler_rt_logbX(x);
+}
+#elif defined(QUAD_PRECISION)
+  #if defined(CRT_LDBL_128BIT)
+static __inline fp_t __compiler_rt_logbl(fp_t x) {
+  return __compiler_rt_logbX(x);
+}
+  #else
+// The generic implementation only works for ieee754 floating point. For other
+// floating point types, continue to rely on the libm implementation for now.
+static __inline long double __compiler_rt_logbl(long double x) {
+  return crt_logbl(x);
+}
+  #endif
 #endif
 
 #endif // FP_LIB_HEADER
diff --git a/lib/builtins/int_math.h b/lib/builtins/int_math.h
index fc81fb7..aa3d072 100644
--- a/lib/builtins/int_math.h
+++ b/lib/builtins/int_math.h
@@ -92,12 +92,8 @@
 #endif
 
 #if defined(_MSC_VER) && !defined(__clang__)
-#define crt_logb(x) logb((x))
-#define crt_logbf(x) logbf((x))
 #define crt_logbl(x) logbl((x))
 #else
-#define crt_logb(x) __builtin_logb((x))
-#define crt_logbf(x) __builtin_logbf((x))
 #define crt_logbl(x) __builtin_logbl((x))
 #endif
 
diff --git a/lib/builtins/int_util.c b/lib/builtins/int_util.c
index de87410..752f201 100644
--- a/lib/builtins/int_util.c
+++ b/lib/builtins/int_util.c
@@ -27,7 +27,7 @@
 #ifndef _WIN32
 __attribute__((visibility("hidden")))
 #endif
-void compilerrt_abort_impl(const char *file, int line, const char *function) {
+void __compilerrt_abort_impl(const char *file, int line, const char *function) {
   panic("%s:%d: abort in %s", file, line, function);
 }
 
@@ -41,7 +41,7 @@
 __attribute__((weak))
 __attribute__((visibility("hidden")))
 #endif
-void compilerrt_abort_impl(const char *file, int line, const char *function) {
+void __compilerrt_abort_impl(const char *file, int line, const char *function) {
   __assert_rtn(function, file, line, "libcompiler_rt abort");
 }
 
@@ -51,7 +51,7 @@
 __attribute__((weak))
 __attribute__((visibility("hidden")))
 #endif
-void compilerrt_abort_impl(const char *file, int line, const char *function) {
+void __compilerrt_abort_impl(const char *file, int line, const char *function) {
   __builtin_trap();
 }
 
@@ -64,7 +64,7 @@
 __attribute__((weak))
 __attribute__((visibility("hidden")))
 #endif
-void compilerrt_abort_impl(const char *file, int line, const char *function) {
+void __compilerrt_abort_impl(const char *file, int line, const char *function) {
   abort();
 }
 
diff --git a/lib/builtins/int_util.h b/lib/builtins/int_util.h
index a7b20ed..c3c8738 100644
--- a/lib/builtins/int_util.h
+++ b/lib/builtins/int_util.h
@@ -20,10 +20,10 @@
 #define INT_UTIL_H
 
 /** \brief Trigger a program abort (or panic for kernel code). */
-#define compilerrt_abort() compilerrt_abort_impl(__FILE__, __LINE__, __func__)
+#define compilerrt_abort() __compilerrt_abort_impl(__FILE__, __LINE__, __func__)
 
-NORETURN void compilerrt_abort_impl(const char *file, int line,
-                                    const char *function);
+NORETURN void __compilerrt_abort_impl(const char *file, int line,
+                                      const char *function);
 
 #define COMPILE_TIME_ASSERT(expr) COMPILE_TIME_ASSERT1(expr, __COUNTER__)
 #define COMPILE_TIME_ASSERT1(expr, cnt) COMPILE_TIME_ASSERT2(expr, cnt)
diff --git a/lib/builtins/ppc/divtc3.c b/lib/builtins/ppc/divtc3.c
index 8ec41c5..ef532b8 100644
--- a/lib/builtins/ppc/divtc3.c
+++ b/lib/builtins/ppc/divtc3.c
@@ -4,6 +4,11 @@
 
 #include "DD.h"
 #include "../int_math.h"
+// Use DOUBLE_PRECISION because the soft-fp method we use is logb (on the upper
+// half of the long doubles), even though this file defines complex division for
+// 128-bit floats.
+#define DOUBLE_PRECISION
+#include "../fp_lib.h"
 
 #if !defined(CRT_INFINITY) && defined(HUGE_VAL)
 #define CRT_INFINITY HUGE_VAL
@@ -21,9 +26,10 @@
 	DD dDD = { .ld = d };
 	
 	int ilogbw = 0;
-	const double logbw = crt_logb(crt_fmax(crt_fabs(cDD.s.hi), crt_fabs(dDD.s.hi) ));
-	
-	if (crt_isfinite(logbw))
+	const double logbw = __compiler_rt_logb(
+		crt_fmax(crt_fabs(cDD.s.hi), crt_fabs(dDD.s.hi)));
+
+        if (crt_isfinite(logbw))
 	{
 		ilogbw = (int)logbw;
 		
diff --git a/lib/cfi/CMakeLists.txt b/lib/cfi/CMakeLists.txt
index 7ed72bc..463a1fd 100644
--- a/lib/cfi/CMakeLists.txt
+++ b/lib/cfi/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_compiler_rt_component(cfi)
 
-if(OS_NAME MATCHES "Linux")
+if(OS_NAME MATCHES "Linux" OR OS_NAME MATCHES "FreeBSD" OR OS_NAME MATCHES "NetBSD")
   set(CFI_SOURCES cfi.cc)
 
   include_directories(..)
diff --git a/lib/cfi/cfi.cc b/lib/cfi/cfi.cc
index a2f127f..b0a9437 100644
--- a/lib/cfi/cfi.cc
+++ b/lib/cfi/cfi.cc
@@ -13,15 +13,33 @@
 
 #include <assert.h>
 #include <elf.h>
+
+#include "sanitizer_common/sanitizer_common.h"
+#if SANITIZER_FREEBSD
+#include <sys/link_elf.h>
+#endif
 #include <link.h>
 #include <string.h>
+#include <stdlib.h>
 #include <sys/mman.h>
 
+#if SANITIZER_LINUX
 typedef ElfW(Phdr) Elf_Phdr;
 typedef ElfW(Ehdr) Elf_Ehdr;
+typedef ElfW(Addr) Elf_Addr;
+typedef ElfW(Sym) Elf_Sym;
+typedef ElfW(Dyn) Elf_Dyn;
+#elif SANITIZER_FREEBSD
+#if SANITIZER_WORDSIZE == 64
+#define ElfW64_Dyn Elf_Dyn
+#define ElfW64_Sym Elf_Sym
+#else
+#define ElfW32_Dyn Elf_Dyn
+#define ElfW32_Sym Elf_Sym
+#endif
+#endif
 
 #include "interception/interception.h"
-#include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_flag_parser.h"
 #include "ubsan/ubsan_init.h"
 #include "ubsan/ubsan_flags.h"
@@ -154,15 +172,25 @@
     *s = sv;
 }
 
-#if SANITIZER_LINUX
+#if SANITIZER_LINUX || SANITIZER_FREEBSD || SANITIZER_NETBSD
 void ShadowBuilder::Install() {
   MprotectReadOnly(shadow_, GetShadowSize());
   uptr main_shadow = GetShadow();
   if (main_shadow) {
     // Update.
+#if SANITIZER_LINUX
     void *res = mremap((void *)shadow_, GetShadowSize(), GetShadowSize(),
                        MREMAP_MAYMOVE | MREMAP_FIXED, (void *)main_shadow);
     CHECK(res != MAP_FAILED);
+#elif SANITIZER_NETBSD
+    void *res = mremap((void *)shadow_, GetShadowSize(), (void *)main_shadow,
+                       GetShadowSize(), MAP_FIXED);
+    CHECK(res != MAP_FAILED);
+#else
+    void *res = MmapFixedOrDie(shadow_, GetShadowSize());
+    CHECK(res != MAP_FAILED);
+    ::memcpy(&shadow_, &main_shadow, GetShadowSize());
+#endif
   } else {
     // Initial setup.
     CHECK_EQ(kCfiShadowLimitsStorageSize, GetPageSizeCached());
@@ -183,17 +211,17 @@
 //    dlopen(RTLD_NOLOAD | RTLD_LAZY)
 //    dlsym("__cfi_check").
 uptr find_cfi_check_in_dso(dl_phdr_info *info) {
-  const ElfW(Dyn) *dynamic = nullptr;
+  const Elf_Dyn *dynamic = nullptr;
   for (int i = 0; i < info->dlpi_phnum; ++i) {
     if (info->dlpi_phdr[i].p_type == PT_DYNAMIC) {
       dynamic =
-          (const ElfW(Dyn) *)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
+          (const Elf_Dyn *)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
       break;
     }
   }
   if (!dynamic) return 0;
   uptr strtab = 0, symtab = 0, strsz = 0;
-  for (const ElfW(Dyn) *p = dynamic; p->d_tag != PT_NULL; ++p) {
+  for (const Elf_Dyn *p = dynamic; p->d_tag != PT_NULL; ++p) {
     if (p->d_tag == DT_SYMTAB)
       symtab = p->d_un.d_ptr;
     else if (p->d_tag == DT_STRTAB)
@@ -227,7 +255,7 @@
     return 0;
   }
 
-  for (const ElfW(Sym) *p = (const ElfW(Sym) *)symtab; (ElfW(Addr))p < strtab;
+  for (const Elf_Sym *p = (const Elf_Sym *)symtab; (Elf_Addr)p < strtab;
        ++p) {
     // There is no reliable way to find the end of the symbol table. In
     // lld-produces files, there are other sections between symtab and strtab.
diff --git a/lib/cfi/cfi_blacklist.txt b/lib/cfi/cfi_blacklist.txt
index 3d73508..4a0f039 100644
--- a/lib/cfi/cfi_blacklist.txt
+++ b/lib/cfi/cfi_blacklist.txt
@@ -1,13 +1,11 @@
 [cfi-unrelated-cast]
 # The specification of std::get_temporary_buffer mandates a cast to
-# uninitialized T* (libstdc++, libc++, MSVC stdlib).
+# uninitialized T* (libstdc++, MSVC stdlib).
 fun:_ZSt20get_temporary_buffer*
-fun:_ZNSt3__120get_temporary_buffer*
 fun:*get_temporary_buffer@.*@std@@*
 
-# STL address-of magic (libstdc++, libc++).
+# STL address-of magic (libstdc++).
 fun:*__addressof*
-fun:_ZNSt3__19addressof*
 
 # Windows C++ stdlib headers that contain bad unrelated casts.
 src:*xmemory0
diff --git a/lib/esan/CMakeLists.txt b/lib/esan/CMakeLists.txt
index 4de5c02..c880971 100644
--- a/lib/esan/CMakeLists.txt
+++ b/lib/esan/CMakeLists.txt
@@ -14,6 +14,7 @@
   esan_interceptors.cpp
   esan_linux.cpp
   esan_sideline_linux.cpp
+  esan_sideline_bsd.cpp
   cache_frag.cpp
   working_set.cpp
   working_set_posix.cpp)
diff --git a/lib/esan/esan_interceptors.cpp b/lib/esan/esan_interceptors.cpp
index 0c596f1..833faa2 100644
--- a/lib/esan/esan_interceptors.cpp
+++ b/lib/esan/esan_interceptors.cpp
@@ -327,7 +327,7 @@
 // Signal-related interceptors
 //===----------------------------------------------------------------------===//
 
-#if SANITIZER_LINUX
+#if SANITIZER_LINUX || SANITIZER_FREEBSD
 typedef void (*signal_handler_t)(int);
 INTERCEPTOR(signal_handler_t, signal, int signum, signal_handler_t handler) {
   void *ctx;
@@ -344,7 +344,7 @@
 #define ESAN_MAYBE_INTERCEPT_SIGNAL
 #endif
 
-#if SANITIZER_LINUX
+#if SANITIZER_LINUX || SANITIZER_FREEBSD
 DECLARE_REAL(int, sigaction, int signum, const struct sigaction *act,
              struct sigaction *oldact)
 INTERCEPTOR(int, sigaction, int signum, const struct sigaction *act,
@@ -363,7 +363,11 @@
   if (REAL(sigaction) == nullptr) {
     // With an instrumented allocator, this is called during interceptor init
     // and we need a raw syscall solution.
+#if SANITIZER_LINUX
     return internal_sigaction_syscall(signum, act, oldact);
+#else
+    return internal_sigaction(signum, act, oldact);
+#endif
   }
   return REAL(sigaction)(signum, (const struct sigaction *)act,
                          (struct sigaction *)oldact);
@@ -376,7 +380,7 @@
 #define ESAN_MAYBE_INTERCEPT_SIGACTION
 #endif
 
-#if SANITIZER_LINUX
+#if SANITIZER_LINUX || SANITIZER_FREEBSD
 INTERCEPTOR(int, sigprocmask, int how, __sanitizer_sigset_t *set,
             __sanitizer_sigset_t *oldset) {
   void *ctx;
diff --git a/lib/esan/esan_shadow.h b/lib/esan/esan_shadow.h
index 72a919a..b76a9cc 100644
--- a/lib/esan/esan_shadow.h
+++ b/lib/esan/esan_shadow.h
@@ -30,7 +30,7 @@
   bool ShadowMergedWithPrev;
 };
 
-#if SANITIZER_LINUX && defined(__x86_64__)
+#if (SANITIZER_LINUX || SANITIZER_FREEBSD) && defined(__x86_64__)
 // Linux x86_64
 //
 // Application memory falls into these 5 regions (ignoring the corner case
diff --git a/lib/esan/esan_sideline_bsd.cpp b/lib/esan/esan_sideline_bsd.cpp
new file mode 100644
index 0000000..3134d37
--- /dev/null
+++ b/lib/esan/esan_sideline_bsd.cpp
@@ -0,0 +1,35 @@
+//===-- esan_sideline_bsd.cpp -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Support for a separate or "sideline" tool thread on FreeBSD.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_platform.h"
+#if SANITIZER_FREEBSD
+
+#include "esan_sideline.h"
+
+namespace __esan {
+
+static SidelineThread *TheThread;
+
+bool SidelineThread::launchThread(SidelineFunc takeSample, void *Arg,
+                                  u32 FreqMilliSec) {
+  return true;
+}
+
+bool SidelineThread::joinThread() {
+  return true;
+}
+
+} // namespace __esan
+
+#endif // SANITIZER_FREEBSD
diff --git a/lib/fuzzer/CMakeLists.txt b/lib/fuzzer/CMakeLists.txt
index 679318e..aae3df8 100644
--- a/lib/fuzzer/CMakeLists.txt
+++ b/lib/fuzzer/CMakeLists.txt
@@ -3,7 +3,7 @@
   FuzzerDataFlowTrace.cpp
   FuzzerDriver.cpp
   FuzzerExtFunctionsDlsym.cpp
-  FuzzerExtFunctionsDlsymWin.cpp
+  FuzzerExtFunctionsWeakAlias.cpp
   FuzzerExtFunctionsWeak.cpp
   FuzzerExtraCounters.cpp
   FuzzerIO.cpp
diff --git a/lib/fuzzer/FuzzerCommand.h b/lib/fuzzer/FuzzerCommand.h
index 255f571..9d258a2 100644
--- a/lib/fuzzer/FuzzerCommand.h
+++ b/lib/fuzzer/FuzzerCommand.h
@@ -81,7 +81,7 @@
   }
 
   // Like hasArgument, but checks for "-[Flag]=...".
-  bool hasFlag(const std::string &Flag) {
+  bool hasFlag(const std::string &Flag) const {
     std::string Arg("-" + Flag + "=");
     auto IsMatch = [&](const std::string &Other) {
       return Arg.compare(0, std::string::npos, Other, 0, Arg.length()) == 0;
@@ -92,7 +92,7 @@
   // Returns the value of the first instance of a given flag, or an empty string
   // if the flag isn't present.  Ignores any occurrences after
   // "-ignore_remaining_args=1", if present.
-  std::string getFlagValue(const std::string &Flag) {
+  std::string getFlagValue(const std::string &Flag) const {
     std::string Arg("-" + Flag + "=");
     auto IsMatch = [&](const std::string &Other) {
       return Arg.compare(0, std::string::npos, Other, 0, Arg.length()) == 0;
diff --git a/lib/fuzzer/FuzzerDefs.h b/lib/fuzzer/FuzzerDefs.h
index a35c7a1..31655d5 100644
--- a/lib/fuzzer/FuzzerDefs.h
+++ b/lib/fuzzer/FuzzerDefs.h
@@ -129,8 +129,15 @@
 
 #if LIBFUZZER_WINDOWS
 #define ATTRIBUTE_INTERFACE __declspec(dllexport)
+// This is used for __sancov_lowest_stack which is needed for
+// -fsanitize-coverage=stack-depth. That feature is not yet available on
+// Windows, so make the symbol static to avoid linking errors.
+#define ATTRIBUTES_INTERFACE_TLS_INITIAL_EXEC \
+  __attribute__((tls_model("initial-exec"))) thread_local static
 #else
 #define ATTRIBUTE_INTERFACE __attribute__((visibility("default")))
+#define ATTRIBUTES_INTERFACE_TLS_INITIAL_EXEC \
+  ATTRIBUTE_INTERFACE __attribute__((tls_model("initial-exec"))) thread_local
 #endif
 
 namespace fuzzer {
diff --git a/lib/fuzzer/FuzzerDriver.cpp b/lib/fuzzer/FuzzerDriver.cpp
index d11f9a6..918a972 100644
--- a/lib/fuzzer/FuzzerDriver.cpp
+++ b/lib/fuzzer/FuzzerDriver.cpp
@@ -615,8 +615,6 @@
   Options.PrintNewCovPcs = Flags.print_pcs;
   Options.PrintNewCovFuncs = Flags.print_funcs;
   Options.PrintFinalStats = Flags.print_final_stats;
-  Options.PrintMutationStats = Flags.print_mutation_stats;
-  Options.UseWeightedMutations = Flags.use_weighted_mutations;
   Options.PrintCorpusStats = Flags.print_corpus_stats;
   Options.PrintCoverage = Flags.print_coverage;
   Options.PrintUnstableStats = Flags.print_unstable_stats;
diff --git a/lib/fuzzer/FuzzerExtFunctionsDlsymWin.cpp b/lib/fuzzer/FuzzerExtFunctionsDlsymWin.cpp
deleted file mode 100644
index 321b3ec..0000000
--- a/lib/fuzzer/FuzzerExtFunctionsDlsymWin.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//===- FuzzerExtFunctionsDlsymWin.cpp - Interface to external functions ---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Implementation using dynamic loading for Windows.
-//===----------------------------------------------------------------------===//
-#include "FuzzerDefs.h"
-#if LIBFUZZER_WINDOWS
-
-#include "FuzzerExtFunctions.h"
-#include "FuzzerIO.h"
-#include "Windows.h"
-
-// This must be included after Windows.h.
-#include "Psapi.h"
-
-namespace fuzzer {
-
-ExternalFunctions::ExternalFunctions() {
-  HMODULE Modules[1024];
-  DWORD BytesNeeded;
-  HANDLE CurrentProcess = GetCurrentProcess();
-
-  if (!EnumProcessModules(CurrentProcess, Modules, sizeof(Modules),
-                          &BytesNeeded)) {
-    Printf("EnumProcessModules failed (error: %d).\n", GetLastError());
-    exit(1);
-  }
-
-  if (sizeof(Modules) < BytesNeeded) {
-    Printf("Error: the array is not big enough to hold all loaded modules.\n");
-    exit(1);
-  }
-
-  for (size_t i = 0; i < (BytesNeeded / sizeof(HMODULE)); i++)
-  {
-    FARPROC Fn;
-#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN)                            \
-    if (this->NAME == nullptr) {                                               \
-      Fn = GetProcAddress(Modules[i], #NAME);                                  \
-      if (Fn == nullptr)                                                       \
-         Fn = GetProcAddress(Modules[i], #NAME "__dll");                       \
-      this->NAME = (decltype(ExternalFunctions::NAME)) Fn;                     \
-    }
-#include "FuzzerExtFunctions.def"
-#undef EXT_FUNC
-  }
-
-#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN)                            \
-  if (this->NAME == nullptr && WARN)                                           \
-    Printf("WARNING: Failed to find function \"%s\".\n", #NAME);
-#include "FuzzerExtFunctions.def"
-#undef EXT_FUNC
-}
-
-} // namespace fuzzer
-
-#endif // LIBFUZZER_WINDOWS
diff --git a/lib/fuzzer/FuzzerExtFunctionsWeak.cpp b/lib/fuzzer/FuzzerExtFunctionsWeak.cpp
index a4e56fc..6a6ef49 100644
--- a/lib/fuzzer/FuzzerExtFunctionsWeak.cpp
+++ b/lib/fuzzer/FuzzerExtFunctionsWeak.cpp
@@ -22,7 +22,7 @@
 extern "C" {
 // Declare these symbols as weak to allow them to be optionally defined.
 #define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN)                            \
-  __attribute__((weak)) RETURN_TYPE NAME FUNC_SIG
+  __attribute__((weak, visibility("default"))) RETURN_TYPE NAME FUNC_SIG
 
 #include "FuzzerExtFunctions.def"
 
diff --git a/lib/fuzzer/FuzzerFlags.def b/lib/fuzzer/FuzzerFlags.def
index 258427c..0417dda 100644
--- a/lib/fuzzer/FuzzerFlags.def
+++ b/lib/fuzzer/FuzzerFlags.def
@@ -162,6 +162,3 @@
 FUZZER_FLAG_INT(analyze_dict, 0, "Experimental")
 FUZZER_DEPRECATED_FLAG(use_clang_coverage)
 FUZZER_FLAG_STRING(data_flow_trace, "Experimental: use the data flow trace")
-FUZZER_FLAG_INT(print_mutation_stats, 0, "Experimental")
-FUZZER_FLAG_INT(use_weighted_mutations, 0, "Experimental: If 1, fuzzing will "
-    "favor mutations that perform better during runtime.")
diff --git a/lib/fuzzer/FuzzerIO.cpp b/lib/fuzzer/FuzzerIO.cpp
index f3ead0e..dac5ec6 100644
--- a/lib/fuzzer/FuzzerIO.cpp
+++ b/lib/fuzzer/FuzzerIO.cpp
@@ -100,14 +100,6 @@
   return DirPath + GetSeparator() + FileName;
 }
 
-std::string Basename(const std::string &Path, char Separator) {
-  size_t Pos = Path.rfind(Separator);
-  if (Pos == std::string::npos)
-    return Path;
-  assert(Pos < Path.size());
-  return Path.substr(Pos + 1);
-}
-
 void DupAndCloseStderr() {
   int OutputFd = DuplicateFile(2);
   if (OutputFd > 0) {
diff --git a/lib/fuzzer/FuzzerIO.h b/lib/fuzzer/FuzzerIO.h
index 6d77574..b4a6819 100644
--- a/lib/fuzzer/FuzzerIO.h
+++ b/lib/fuzzer/FuzzerIO.h
@@ -68,7 +68,7 @@
 
 char GetSeparator();
 // Similar to the basename utility: returns the file name w/o the dir prefix.
-std::string Basename(const std::string &Path, char Separator = GetSeparator());
+std::string Basename(const std::string &Path);
 
 FILE* OpenFile(int Fd, const char *Mode);
 
diff --git a/lib/fuzzer/FuzzerIOPosix.cpp b/lib/fuzzer/FuzzerIOPosix.cpp
index 17e884d..401b4cb 100644
--- a/lib/fuzzer/FuzzerIOPosix.cpp
+++ b/lib/fuzzer/FuzzerIOPosix.cpp
@@ -46,6 +46,13 @@
   return St.st_size;
 }
 
+std::string Basename(const std::string &Path) {
+  size_t Pos = Path.rfind(GetSeparator());
+  if (Pos == std::string::npos) return Path;
+  assert(Pos < Path.size());
+  return Path.substr(Pos + 1);
+}
+
 void ListFilesInDirRecursive(const std::string &Dir, long *Epoch,
                              Vector<std::string> *V, bool TopDir) {
   auto E = GetEpoch(Dir);
diff --git a/lib/fuzzer/FuzzerIOWindows.cpp b/lib/fuzzer/FuzzerIOWindows.cpp
index 7485364..75dcaf7 100644
--- a/lib/fuzzer/FuzzerIOWindows.cpp
+++ b/lib/fuzzer/FuzzerIOWindows.cpp
@@ -72,6 +72,26 @@
   return IsFile(Path, Att);
 }
 
+std::string Basename(const std::string &Path) {
+  size_t Pos = Path.find_last_of("/\\");
+  if (Pos == std::string::npos) return Path;
+  assert(Pos < Path.size());
+  return Path.substr(Pos + 1);
+}
+
+size_t FileSize(const std::string &Path) {
+  WIN32_FILE_ATTRIBUTE_DATA attr;
+  if (!GetFileAttributesExA(Path.c_str(), GetFileExInfoStandard, &attr)) {
+    Printf("GetFileAttributesExA() failed for \"%s\" (Error code: %lu).\n",
+           Path.c_str(), GetLastError());
+    return 0;
+  }
+  ULARGE_INTEGER size;
+  size.HighPart = attr.nFileSizeHigh;
+  size.LowPart = attr.nFileSizeLow;
+  return size.QuadPart;
+}
+
 void ListFilesInDirRecursive(const std::string &Dir, long *Epoch,
                              Vector<std::string> *V, bool TopDir) {
   auto E = GetEpoch(Dir);
@@ -91,7 +111,7 @@
   {
     if (GetLastError() == ERROR_FILE_NOT_FOUND)
       return;
-    Printf("No such directory: %s; exiting\n", Dir.c_str());
+    Printf("No such file or directory: %s; exiting\n", Dir.c_str());
     exit(1);
   }
 
diff --git a/lib/fuzzer/FuzzerLoop.cpp b/lib/fuzzer/FuzzerLoop.cpp
index c7b13d1..7b98f55 100644
--- a/lib/fuzzer/FuzzerLoop.cpp
+++ b/lib/fuzzer/FuzzerLoop.cpp
@@ -38,7 +38,6 @@
 
 namespace fuzzer {
 static const size_t kMaxUnitSizeToPrint = 256;
-static const size_t kUpdateMutationWeightRuns = 10000;
 
 thread_local bool Fuzzer::IsMyThread;
 
@@ -361,7 +360,6 @@
     TPC.DumpCoverage();
   if (Options.PrintCorpusStats)
     Corpus.PrintStats();
-  if (Options.PrintMutationStats) MD.PrintMutationStats();
   if (!Options.PrintFinalStats)
     return;
   size_t ExecPerSec = execPerSec();
@@ -550,9 +548,6 @@
 
 void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
   TPC.RecordInitialStack();
-  if (Options.UseWeightedMutations &&
-      TotalNumberOfRuns % kUpdateMutationWeightRuns == 0)
-    MD.UpdateDistribution();
   TotalNumberOfRuns++;
   assert(InFuzzingThread());
   if (SMR.IsClient())
diff --git a/lib/fuzzer/FuzzerMutate.cpp b/lib/fuzzer/FuzzerMutate.cpp
index fac3c7a..142b2b0 100644
--- a/lib/fuzzer/FuzzerMutate.cpp
+++ b/lib/fuzzer/FuzzerMutate.cpp
@@ -30,41 +30,34 @@
   DefaultMutators.insert(
       DefaultMutators.begin(),
       {
-          // Initialize useful and total mutation counts as 1 in order to
-          // have mutation stats (i.e. weights) with equal non-zero values.
-          {&MutationDispatcher::Mutate_EraseBytes, "EraseBytes", 1, 1},
-          {&MutationDispatcher::Mutate_InsertByte, "InsertByte", 1, 1},
+          {&MutationDispatcher::Mutate_EraseBytes, "EraseBytes"},
+          {&MutationDispatcher::Mutate_InsertByte, "InsertByte"},
           {&MutationDispatcher::Mutate_InsertRepeatedBytes,
-           "InsertRepeatedBytes", 1, 1},
-          {&MutationDispatcher::Mutate_ChangeByte, "ChangeByte", 1, 1},
-          {&MutationDispatcher::Mutate_ChangeBit, "ChangeBit", 1, 1},
-          {&MutationDispatcher::Mutate_ShuffleBytes, "ShuffleBytes", 1, 1},
-          {&MutationDispatcher::Mutate_ChangeASCIIInteger, "ChangeASCIIInt", 1,
-           1},
-          {&MutationDispatcher::Mutate_ChangeBinaryInteger, "ChangeBinInt", 1,
-           1},
-          {&MutationDispatcher::Mutate_CopyPart, "CopyPart", 1, 1},
-          {&MutationDispatcher::Mutate_CrossOver, "CrossOver", 1, 1},
+           "InsertRepeatedBytes"},
+          {&MutationDispatcher::Mutate_ChangeByte, "ChangeByte"},
+          {&MutationDispatcher::Mutate_ChangeBit, "ChangeBit"},
+          {&MutationDispatcher::Mutate_ShuffleBytes, "ShuffleBytes"},
+          {&MutationDispatcher::Mutate_ChangeASCIIInteger, "ChangeASCIIInt"},
+          {&MutationDispatcher::Mutate_ChangeBinaryInteger, "ChangeBinInt"},
+          {&MutationDispatcher::Mutate_CopyPart, "CopyPart"},
+          {&MutationDispatcher::Mutate_CrossOver, "CrossOver"},
           {&MutationDispatcher::Mutate_AddWordFromManualDictionary,
-           "ManualDict", 1, 1},
+           "ManualDict"},
           {&MutationDispatcher::Mutate_AddWordFromPersistentAutoDictionary,
-           "PersAutoDict", 1, 1},
+           "PersAutoDict"},
       });
   if(Options.UseCmp)
     DefaultMutators.push_back(
-        {&MutationDispatcher::Mutate_AddWordFromTORC, "CMP", 1, 1});
+        {&MutationDispatcher::Mutate_AddWordFromTORC, "CMP"});
 
   if (EF->LLVMFuzzerCustomMutator)
-    Mutators.push_back({&MutationDispatcher::Mutate_Custom, "Custom", 1, 1});
+    Mutators.push_back({&MutationDispatcher::Mutate_Custom, "Custom"});
   else
     Mutators = DefaultMutators;
 
   if (EF->LLVMFuzzerCustomCrossOver)
     Mutators.push_back(
-        {&MutationDispatcher::Mutate_CustomCrossOver, "CustomCrossOver", 1, 1});
-
-  // For weighted mutation selection, init with uniform weights distribution.
-  Stats.resize(Mutators.size());
+        {&MutationDispatcher::Mutate_CustomCrossOver, "CustomCrossOver"});
 }
 
 static char RandCh(Random &Rand) {
@@ -471,7 +464,6 @@
     if (!PersistentAutoDictionary.ContainsWord(DE->GetW()))
       PersistentAutoDictionary.push_back({DE->GetW(), 1});
   }
-  RecordUsefulMutations();
 }
 
 void MutationDispatcher::PrintRecommendedDictionary() {
@@ -492,7 +484,8 @@
 
 void MutationDispatcher::PrintMutationSequence() {
   Printf("MS: %zd ", CurrentMutatorSequence.size());
-  for (auto M : CurrentMutatorSequence) Printf("%s-", M->Name);
+  for (auto M : CurrentMutatorSequence)
+    Printf("%s-", M.Name);
   if (!CurrentDictionaryEntrySequence.empty()) {
     Printf(" DE: ");
     for (auto DE : CurrentDictionaryEntrySequence) {
@@ -519,20 +512,13 @@
   // Some mutations may fail (e.g. can't insert more bytes if Size == MaxSize),
   // in which case they will return 0.
   // Try several times before returning un-mutated data.
-  Mutator *M = nullptr;
   for (int Iter = 0; Iter < 100; Iter++) {
-    // Even when using weighted mutations, fallback to the default selection in
-    // 20% of cases.
-    if (Options.UseWeightedMutations && Rand(5))
-      M = &Mutators[WeightedIndex()];
-    else
-      M = &Mutators[Rand(Mutators.size())];
-    size_t NewSize = (this->*(M->Fn))(Data, Size, MaxSize);
+    auto M = Mutators[Rand(Mutators.size())];
+    size_t NewSize = (this->*(M.Fn))(Data, Size, MaxSize);
     if (NewSize && NewSize <= MaxSize) {
       if (Options.OnlyASCII)
         ToASCII(Data, NewSize);
       CurrentMutatorSequence.push_back(M);
-      M->TotalCount++;
       return NewSize;
     }
   }
@@ -573,34 +559,4 @@
       {W, std::numeric_limits<size_t>::max()});
 }
 
-void MutationDispatcher::RecordUsefulMutations() {
-  for (auto M : CurrentMutatorSequence) M->UsefulCount++;
-}
-
-void MutationDispatcher::PrintMutationStats() {
-  Printf("\nstat::mutation_usefulness:      ");
-  UpdateMutationStats();
-  for (size_t i = 0; i < Stats.size(); i++) {
-    Printf("%.3f", 100 * Stats[i]);
-    if (i < Stats.size() - 1)
-      Printf(",");
-    else
-      Printf("\n");
-  }
-}
-
-void MutationDispatcher::UpdateMutationStats() {
-  // Calculate usefulness statistic for each mutation
-  for (size_t i = 0; i < Stats.size(); i++)
-    Stats[i] =
-        static_cast<double>(Mutators[i].UsefulCount) / Mutators[i].TotalCount;
-}
-
-void MutationDispatcher::UpdateDistribution() {
-  UpdateMutationStats();
-  Distribution = std::discrete_distribution<size_t>(Stats.begin(), Stats.end());
-}
-
-size_t MutationDispatcher::WeightedIndex() { return Distribution(GetRand()); }
-
 }  // namespace fuzzer
diff --git a/lib/fuzzer/FuzzerMutate.h b/lib/fuzzer/FuzzerMutate.h
index d89667c..a51c7fb 100644
--- a/lib/fuzzer/FuzzerMutate.h
+++ b/lib/fuzzer/FuzzerMutate.h
@@ -93,29 +93,10 @@
 
   Random &GetRand() { return Rand; }
 
-  /// Records tally of mutations resulting in new coverage, for usefulness
-  /// metric.
-  void RecordUsefulMutations();
-
-  /// Outputs usefulness stats on command line if option is enabled.
-  void PrintMutationStats();
-
-  /// Recalculates mutation stats based on latest run data.
-  void UpdateMutationStats();
-
-  /// Sets weights based on mutation performance during fuzzer run.
-  void UpdateDistribution();
-
-  /// Returns the index of a mutation based on how useful it has been.
-  /// Favors mutations with higher usefulness ratios but can return any index.
-  size_t WeightedIndex();
-
  private:
   struct Mutator {
     size_t (MutationDispatcher::*Fn)(uint8_t *Data, size_t Size, size_t Max);
     const char *Name;
-    uint64_t UsefulCount;
-    uint64_t TotalCount;
   };
 
   size_t AddWordFromDictionary(Dictionary &D, uint8_t *Data, size_t Size,
@@ -154,7 +135,6 @@
   Dictionary PersistentAutoDictionary;
 
   Vector<DictionaryEntry *> CurrentDictionaryEntrySequence;
-  Vector<Mutator *> CurrentMutatorSequence;
 
   static const size_t kCmpDictionaryEntriesDequeSize = 16;
   DictionaryEntry CmpDictionaryEntriesDeque[kCmpDictionaryEntriesDequeSize];
@@ -169,10 +149,7 @@
 
   Vector<Mutator> Mutators;
   Vector<Mutator> DefaultMutators;
-
-  // Used to weight mutations based on usefulness.
-  Vector<double> Stats;
-  std::discrete_distribution<size_t> Distribution;
+  Vector<Mutator> CurrentMutatorSequence;
 };
 
 }  // namespace fuzzer
diff --git a/lib/fuzzer/FuzzerOptions.h b/lib/fuzzer/FuzzerOptions.h
index 82855ce..bb642f1 100644
--- a/lib/fuzzer/FuzzerOptions.h
+++ b/lib/fuzzer/FuzzerOptions.h
@@ -52,8 +52,6 @@
   bool PrintNewCovPcs = false;
   int PrintNewCovFuncs = 0;
   bool PrintFinalStats = false;
-  bool PrintMutationStats = false;
-  bool UseWeightedMutations = false;
   bool PrintCorpusStats = false;
   bool PrintCoverage = false;
   bool PrintUnstableStats = false;
diff --git a/lib/fuzzer/FuzzerTracePC.cpp b/lib/fuzzer/FuzzerTracePC.cpp
index 1aba816..7ba75c7 100644
--- a/lib/fuzzer/FuzzerTracePC.cpp
+++ b/lib/fuzzer/FuzzerTracePC.cpp
@@ -32,8 +32,7 @@
 uintptr_t __sancov_trace_pc_pcs[fuzzer::TracePC::kNumPCs];
 
 // Used by -fsanitize-coverage=stack-depth to track stack depth
-ATTRIBUTE_INTERFACE __attribute__((tls_model("initial-exec")))
-thread_local uintptr_t __sancov_lowest_stack;
+ATTRIBUTES_INTERFACE_TLS_INITIAL_EXEC uintptr_t __sancov_lowest_stack;
 
 namespace fuzzer {
 
@@ -195,11 +194,42 @@
   ValueProfileMap.AddValueModPrime(Idx);
 }
 
+/// \return the address of the previous instruction.
+/// Note: the logic is copied from `sanitizer_common/sanitizer_stacktrace.h`
+inline ALWAYS_INLINE uintptr_t GetPreviousInstructionPc(uintptr_t PC) {
+#if defined(__arm__)
+  // T32 (Thumb) branch instructions might be 16 or 32 bit long,
+  // so we return (pc-2) in that case in order to be safe.
+  // For A32 mode we return (pc-4) because all instructions are 32 bit long.
+  return (PC - 3) & (~1);
+#elif defined(__powerpc__) || defined(__powerpc64__) || defined(__aarch64__)
+  // PCs are always 4 byte aligned.
+  return PC - 4;
+#elif defined(__sparc__) || defined(__mips__)
+  return PC - 8;
+#else
+  return PC - 1;
+#endif
+}
+
+/// \return the address of the next instruction.
+/// Note: the logic is copied from `sanitizer_common/sanitizer_stacktrace.cc`
+inline ALWAYS_INLINE uintptr_t GetNextInstructionPc(uintptr_t PC) {
+#if defined(__mips__)
+  return PC + 8;
+#elif defined(__powerpc__) || defined(__sparc__) || defined(__arm__) || \
+    defined(__aarch64__)
+  return PC + 4;
+#else
+  return PC + 1;
+#endif
+}
+
 void TracePC::UpdateObservedPCs() {
   Vector<uintptr_t> CoveredFuncs;
   auto ObservePC = [&](uintptr_t PC) {
     if (ObservedPCs.insert(PC).second && DoPrintNewPCs) {
-      PrintPC("\tNEW_PC: %p %F %L", "\tNEW_PC: %p", PC + 1);
+      PrintPC("\tNEW_PC: %p %F %L", "\tNEW_PC: %p", GetNextInstructionPc(PC));
       Printf("\n");
     }
   };
@@ -234,22 +264,11 @@
   for (size_t i = 0, N = Min(CoveredFuncs.size(), NumPrintNewFuncs); i < N;
        i++) {
     Printf("\tNEW_FUNC[%zd/%zd]: ", i + 1, CoveredFuncs.size());
-    PrintPC("%p %F %L", "%p", CoveredFuncs[i] + 1);
+    PrintPC("%p %F %L", "%p", GetNextInstructionPc(CoveredFuncs[i]));
     Printf("\n");
   }
 }
 
-inline ALWAYS_INLINE uintptr_t GetPreviousInstructionPc(uintptr_t PC) {
-  // TODO: this implementation is x86 only.
-  // see sanitizer_common GetPreviousInstructionPc for full implementation.
-  return PC - 1;
-}
-
-inline ALWAYS_INLINE uintptr_t GetNextInstructionPc(uintptr_t PC) {
-  // TODO: this implementation is x86 only.
-  // see sanitizer_common GetPreviousInstructionPc for full implementation.
-  return PC + 1;
-}
 
 static std::string GetModuleName(uintptr_t PC) {
   char ModulePathRaw[4096] = "";  // What's PATH_MAX in portable C++?
diff --git a/lib/fuzzer/FuzzerUtilFuchsia.cpp b/lib/fuzzer/FuzzerUtilFuchsia.cpp
index cd2bb74..cd48fef 100644
--- a/lib/fuzzer/FuzzerUtilFuchsia.cpp
+++ b/lib/fuzzer/FuzzerUtilFuchsia.cpp
@@ -49,9 +49,6 @@
 
 namespace {
 
-// TODO(phosek): remove this and replace it with ZX_TIME_INFINITE
-#define ZX_TIME_INFINITE_OLD INT64_MAX
-
 // A magic value for the Zircon exception port, chosen to spell 'FUZZING'
 // when interpreted as a byte sequence on little-endian platforms.
 const uint64_t kFuzzingCrash = 0x474e495a5a5546;
@@ -237,7 +234,7 @@
             "_zx_object_signal");
 
   zx_port_packet_t Packet;
-  ExitOnErr(_zx_port_wait(Port.Handle, ZX_TIME_INFINITE_OLD, &Packet),
+  ExitOnErr(_zx_port_wait(Port.Handle, ZX_TIME_INFINITE, &Packet),
             "_zx_port_wait");
 
   // At this point, we want to get the state of the crashing thread, but
@@ -315,8 +312,8 @@
   ExitOnErr(_zx_event_create(0, &Event), "_zx_event_create");
 
   std::thread T(CrashHandler, &Event);
-  zx_status_t Status = _zx_object_wait_one(Event, ZX_USER_SIGNAL_0,
-                                           ZX_TIME_INFINITE_OLD, nullptr);
+  zx_status_t Status =
+      _zx_object_wait_one(Event, ZX_USER_SIGNAL_0, ZX_TIME_INFINITE, nullptr);
   _zx_handle_close(Event);
   ExitOnErr(Status, "_zx_object_wait_one");
 
@@ -378,19 +375,28 @@
     Argv[i] = Args[i].c_str();
   Argv[Argc] = nullptr;
 
-  // Determine stdout
+  // Determine output.  On Fuchsia, the fuzzer is typically run as a component
+  // that lacks a mutable working directory. Fortunately, when this is the case
+  // a mutable output directory must be specified using "-artifact_prefix=...",
+  // so write the log file(s) there.
   int FdOut = STDOUT_FILENO;
-
   if (Cmd.hasOutputFile()) {
-    auto Filename = Cmd.getOutputFile();
-    FdOut = open(Filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0);
+    std::string Path;
+    if (Cmd.hasFlag("artifact_prefix"))
+      Path = Cmd.getFlagValue("artifact_prefix") + "/" + Cmd.getOutputFile();
+    else
+      Path = Cmd.getOutputFile();
+    FdOut = open(Path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0);
     if (FdOut == -1) {
-      Printf("libFuzzer: failed to open %s: %s\n", Filename.c_str(),
+      Printf("libFuzzer: failed to open %s: %s\n", Path.c_str(),
              strerror(errno));
       return ZX_ERR_IO;
     }
   }
-  auto CloseFdOut = at_scope_exit([&]() { close(FdOut); } );
+  auto CloseFdOut = at_scope_exit([FdOut]() {
+    if (FdOut != STDOUT_FILENO)
+      close(FdOut);
+  });
 
   // Determine stderr
   int FdErr = STDERR_FILENO;
@@ -440,7 +446,7 @@
 
   // Now join the process and return the exit status.
   if ((rc = _zx_object_wait_one(ProcessHandle, ZX_PROCESS_TERMINATED,
-                                ZX_TIME_INFINITE_OLD, nullptr)) != ZX_OK) {
+                                ZX_TIME_INFINITE, nullptr)) != ZX_OK) {
     Printf("libFuzzer: failed to join '%s': %s\n", Argv[0],
            _zx_status_get_string(rc));
     return rc;
diff --git a/lib/fuzzer/FuzzerUtilWindows.cpp b/lib/fuzzer/FuzzerUtilWindows.cpp
index 8227e77..393b476 100644
--- a/lib/fuzzer/FuzzerUtilWindows.cpp
+++ b/lib/fuzzer/FuzzerUtilWindows.cpp
@@ -24,7 +24,7 @@
 #include <windows.h>
 
 // This must be included after windows.h.
-#include <Psapi.h>
+#include <psapi.h>
 
 namespace fuzzer {
 
@@ -179,7 +179,9 @@
 }
 
 std::string DisassembleCmd(const std::string &FileName) {
-  if (ExecuteCommand("dumpbin /summary > nul") == 0)
+  Vector<std::string> command_vector;
+  command_vector.push_back("dumpbin /summary > nul");
+  if (ExecuteCommand(Command(command_vector)) == 0)
     return "dumpbin /disasm " + FileName;
   Printf("libFuzzer: couldn't find tool to disassemble (dumpbin)\n");
   exit(1);
diff --git a/lib/fuzzer/tests/CMakeLists.txt b/lib/fuzzer/tests/CMakeLists.txt
index ed58071..0b561c1 100644
--- a/lib/fuzzer/tests/CMakeLists.txt
+++ b/lib/fuzzer/tests/CMakeLists.txt
@@ -17,6 +17,8 @@
 
 if(APPLE OR CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")
   list(APPEND LIBFUZZER_UNITTEST_LINK_FLAGS -lc++ -lpthread)
+elseif(WIN32)
+  list(APPEND LIBFUZZER_UNITTEST_LINK_FLAGS -lstdc++)
 else()
   list(APPEND LIBFUZZER_UNITTEST_LINK_FLAGS -lstdc++ -lpthread)
 endif()
diff --git a/lib/fuzzer/tests/FuzzerUnittest.cpp b/lib/fuzzer/tests/FuzzerUnittest.cpp
index e3b0670..7cdd445 100644
--- a/lib/fuzzer/tests/FuzzerUnittest.cpp
+++ b/lib/fuzzer/tests/FuzzerUnittest.cpp
@@ -34,6 +34,13 @@
   EXPECT_EQ(Basename("/bar"), "bar");
   EXPECT_EQ(Basename("foo/x"), "x");
   EXPECT_EQ(Basename("foo/"), "");
+#if LIBFUZZER_WINDOWS
+  EXPECT_EQ(Basename("foo\\bar"), "bar");
+  EXPECT_EQ(Basename("foo\\bar/baz"), "baz");
+  EXPECT_EQ(Basename("\\bar"), "bar");
+  EXPECT_EQ(Basename("foo\\x"), "x");
+  EXPECT_EQ(Basename("foo\\"), "");
+#endif
 }
 
 TEST(Fuzzer, CrossOver) {
diff --git a/lib/hwasan/CMakeLists.txt b/lib/hwasan/CMakeLists.txt
index 42bf436..552133e 100644
--- a/lib/hwasan/CMakeLists.txt
+++ b/lib/hwasan/CMakeLists.txt
@@ -10,6 +10,7 @@
   hwasan_poisoning.cc
   hwasan_report.cc
   hwasan_thread.cc
+  hwasan_thread_list.cc
   )
 
 set(HWASAN_RTL_CXX_SOURCES
@@ -25,8 +26,12 @@
   hwasan_mapping.h
   hwasan_poisoning.h
   hwasan_report.h
-  hwasan_thread.h)
+  hwasan_thread.h
+  hwasan_thread_list.h
+  )
 
+set(HWASAN_DEFINITIONS)
+append_list_if(COMPILER_RT_HWASAN_WITH_INTERCEPTORS HWASAN_WITH_INTERCEPTORS=1 HWASAN_DEFINITIONS)
 
 set(HWASAN_RTL_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_rtti_flag(OFF HWASAN_RTL_CFLAGS)
@@ -36,6 +41,14 @@
 
 set(HWASAN_DYNAMIC_LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS})
 
+if(ANDROID)
+# Put most Sanitizer shared libraries in the global group. For more details, see
+# android-changes-for-ndk-developers.md#changes-to-library-search-order
+  if (COMPILER_RT_HAS_Z_GLOBAL)
+    list(APPEND HWASAN_DYNAMIC_LINK_FLAGS -Wl,-z,global)
+  endif()
+endif()
+
 set(HWASAN_DYNAMIC_CFLAGS ${HWASAN_RTL_CFLAGS})
 append_list_if(COMPILER_RT_HAS_FTLS_MODEL_INITIAL_EXEC
   -ftls-model=initial-exec HWASAN_DYNAMIC_CFLAGS)
@@ -47,7 +60,6 @@
 append_list_if(COMPILER_RT_HAS_LIBRT rt HWASAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBM m HWASAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBPTHREAD pthread HWASAN_DYNAMIC_LIBS)
-append_list_if(COMPILER_RT_HAS_LIBLOG log HWASAN_DYNAMIC_LIBS)
 
 # Static runtime library.
 add_compiler_rt_component(hwasan)
@@ -56,23 +68,27 @@
   ARCHS ${HWASAN_SUPPORTED_ARCH}
   SOURCES ${HWASAN_RTL_SOURCES}
   ADDITIONAL_HEADERS ${HWASAN_RTL_HEADERS}
-  CFLAGS ${HWASAN_RTL_CFLAGS})
+  CFLAGS ${HWASAN_RTL_CFLAGS}
+  DEFS ${HWASAN_DEFINITIONS})
 add_compiler_rt_object_libraries(RTHwasan_cxx
   ARCHS ${HWASAN_SUPPORTED_ARCH}
   SOURCES ${HWASAN_RTL_CXX_SOURCES}
   ADDITIONAL_HEADERS ${HWASAN_RTL_HEADERS}
-  CFLAGS ${HWASAN_RTL_CFLAGS})
+  CFLAGS ${HWASAN_RTL_CFLAGS}
+  DEFS ${HWASAN_DEFINITIONS})
 add_compiler_rt_object_libraries(RTHwasan_dynamic
   ARCHS ${HWASAN_SUPPORTED_ARCH}
   SOURCES ${HWASAN_RTL_SOURCES} ${HWASAN_RTL_CXX_SOURCES}
   ADDITIONAL_HEADERS ${HWASAN_RTL_HEADERS}
-  CFLAGS ${HWASAN_DYNAMIC_CFLAGS})
+  CFLAGS ${HWASAN_DYNAMIC_CFLAGS}
+  DEFS ${HWASAN_DEFINITIONS})
 
 file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cc "")
 add_compiler_rt_object_libraries(RTHwasan_dynamic_version_script_dummy
   ARCHS ${HWASAN_SUPPORTED_ARCH}
   SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dummy.cc
-  CFLAGS ${HWASAN_DYNAMIC_CFLAGS})
+  CFLAGS ${HWASAN_DYNAMIC_CFLAGS}
+  DEFS ${HWASAN_DEFINITIONS})
 
 foreach(arch ${HWASAN_SUPPORTED_ARCH})
   add_compiler_rt_runtime(clang_rt.hwasan
diff --git a/lib/hwasan/hwasan.cc b/lib/hwasan/hwasan.cc
index 7dab824..02aee4d 100644
--- a/lib/hwasan/hwasan.cc
+++ b/lib/hwasan/hwasan.cc
@@ -17,6 +17,7 @@
 #include "hwasan_poisoning.h"
 #include "hwasan_report.h"
 #include "hwasan_thread.h"
+#include "hwasan_thread_list.h"
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_flags.h"
@@ -36,17 +37,17 @@
 namespace __hwasan {
 
 void EnterSymbolizer() {
-  HwasanThread *t = GetCurrentThread();
+  Thread *t = GetCurrentThread();
   CHECK(t);
   t->EnterSymbolizer();
 }
 void ExitSymbolizer() {
-  HwasanThread *t = GetCurrentThread();
+  Thread *t = GetCurrentThread();
   CHECK(t);
   t->LeaveSymbolizer();
 }
 bool IsInSymbolizer() {
-  HwasanThread *t = GetCurrentThread();
+  Thread *t = GetCurrentThread();
   return t && t->InSymbolizer();
 }
 
@@ -57,6 +58,7 @@
 }
 
 int hwasan_inited = 0;
+int hwasan_shadow_inited = 0;
 bool hwasan_init_is_running;
 
 int hwasan_report_count = 0;
@@ -86,7 +88,18 @@
     cf.check_printf = false;
     cf.intercept_tls_get_addr = true;
     cf.exitcode = 99;
+    // Sigtrap is used in error reporting.
     cf.handle_sigtrap = kHandleSignalExclusive;
+
+#if SANITIZER_ANDROID
+    // Let platform handle other signals. It is better at reporting them then we
+    // are.
+    cf.handle_segv = kHandleSignalNo;
+    cf.handle_sigbus = kHandleSignalNo;
+    cf.handle_abort = kHandleSignalNo;
+    cf.handle_sigill = kHandleSignalNo;
+    cf.handle_sigfpe = kHandleSignalNo;
+#endif
     OverrideCommonFlags(cf);
   }
 
@@ -119,7 +132,8 @@
 #if HWASAN_CONTAINS_UBSAN
   ubsan_parser.ParseString(GetEnv("UBSAN_OPTIONS"));
 #endif
-  VPrintf(1, "HWASAN_OPTIONS: %s\n", hwasan_options ? hwasan_options : "<empty>");
+  VPrintf(1, "HWASAN_OPTIONS: %s\n",
+          hwasan_options ? hwasan_options : "<empty>");
 
   InitializeCommonFlags();
 
@@ -130,8 +144,13 @@
 
 void GetStackTrace(BufferedStackTrace *stack, uptr max_s, uptr pc, uptr bp,
                    void *context, bool request_fast_unwind) {
-  HwasanThread *t = GetCurrentThread();
-  if (!t || !StackTrace::WillUseFastUnwind(request_fast_unwind)) {
+  Thread *t = GetCurrentThread();
+  if (!t) {
+    // the thread is still being created.
+    stack->size = 0;
+    return;
+  }
+  if (!StackTrace::WillUseFastUnwind(request_fast_unwind)) {
     // Block reports from our interceptors during _Unwind_Backtrace.
     SymbolizerScope sym_scope;
     return stack->Unwind(max_s, pc, bp, context, 0, 0, request_fast_unwind);
@@ -153,6 +172,54 @@
   Die();
 }
 
+static constexpr uptr kMemoryUsageBufferSize = 4096;
+
+static void HwasanFormatMemoryUsage(InternalScopedString &s) {
+  HwasanThreadList &thread_list = hwasanThreadList();
+  auto thread_stats = thread_list.GetThreadStats();
+  auto *sds = StackDepotGetStats();
+  AllocatorStatCounters asc;
+  GetAllocatorStats(asc);
+  s.append(
+      "HWASAN pid: %d rss: %zd threads: %zd stacks: %zd"
+      " thr_aux: %zd stack_depot: %zd uniq_stacks: %zd"
+      " heap: %zd",
+      internal_getpid(), GetRSS(), thread_stats.n_live_threads,
+      thread_stats.total_stack_size,
+      thread_stats.n_live_threads * thread_list.MemoryUsedPerThread(),
+      sds->allocated, sds->n_uniq_ids, asc[AllocatorStatMapped]);
+}
+
+#if SANITIZER_ANDROID
+static char *memory_usage_buffer = nullptr;
+
+#define PR_SET_VMA 0x53564d41
+#define PR_SET_VMA_ANON_NAME 0
+
+static void InitMemoryUsage() {
+  memory_usage_buffer =
+      (char *)MmapOrDie(kMemoryUsageBufferSize, "memory usage string");
+  CHECK(memory_usage_buffer);
+  memory_usage_buffer[0] = '\0';
+  CHECK(internal_prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
+                       (uptr)memory_usage_buffer, kMemoryUsageBufferSize,
+                       (uptr)memory_usage_buffer) == 0);
+}
+
+void UpdateMemoryUsage() {
+  if (!flags()->export_memory_stats)
+    return;
+  if (!memory_usage_buffer)
+    InitMemoryUsage();
+  InternalScopedString s(kMemoryUsageBufferSize);
+  HwasanFormatMemoryUsage(s);
+  internal_strncpy(memory_usage_buffer, s.data(), kMemoryUsageBufferSize - 1);
+  memory_usage_buffer[kMemoryUsageBufferSize - 1] = '\0';
+}
+#else
+void UpdateMemoryUsage() {}
+#endif
+
 } // namespace __hwasan
 
 // Interface.
@@ -161,6 +228,16 @@
 
 uptr __hwasan_shadow_memory_dynamic_address;  // Global interface symbol.
 
+void __hwasan_shadow_init() {
+  if (hwasan_shadow_inited) return;
+  if (!InitShadow()) {
+    Printf("FATAL: HWAddressSanitizer cannot mmap the shadow memory.\n");
+    DumpProcessMap();
+    Die();
+  }
+  hwasan_shadow_inited = 1;
+}
+
 void __hwasan_init() {
   CHECK(!hwasan_init_is_running);
   if (hwasan_inited) return;
@@ -178,17 +255,16 @@
   __sanitizer_set_report_path(common_flags()->log_path);
 
   DisableCoreDumperIfNecessary();
-  if (!InitShadow()) {
-    Printf("FATAL: HWAddressSanitizer cannot mmap the shadow memory.\n");
-    if (HWASAN_FIXED_MAPPING) {
-      Printf("FATAL: Make sure to compile with -fPIE and to link with -pie.\n");
-      Printf("FATAL: Disabling ASLR is known to cause this error.\n");
-      Printf("FATAL: If running under GDB, try "
-             "'set disable-randomization off'.\n");
-    }
-    DumpProcessMap();
-    Die();
-  }
+
+  __hwasan_shadow_init();
+
+  InitThreads();
+  hwasanThreadList().CreateCurrentThread();
+
+  MadviseShadow();
+
+  // This may call libc -> needs initialized shadow.
+  AndroidLogInit();
 
   InitializeInterceptors();
   InstallDeadlySignalHandlers(HwasanOnDeadlySignal);
@@ -198,14 +274,11 @@
 
   InitializeCoverage(common_flags()->coverage, common_flags()->coverage_dir);
 
-  HwasanTSDInit(HwasanTSDDtor);
+  HwasanTSDInit();
+  HwasanTSDThreadInit();
 
   HwasanAllocatorInit();
 
-  HwasanThread *main_thread = HwasanThread::Create(nullptr, nullptr);
-  SetCurrentThread(main_thread);
-  main_thread->ThreadStart();
-
 #if HWASAN_CONTAINS_UBSAN
   __ubsan::InitAsPlugin();
 #endif
@@ -216,9 +289,14 @@
   hwasan_inited = 1;
 }
 
-void __hwasan_print_shadow(const void *x, uptr size) {
-  // FIXME:
-  Printf("FIXME: __hwasan_print_shadow unimplemented\n");
+void __hwasan_print_shadow(const void *p, uptr sz) {
+  uptr ptr_raw = UntagAddr(reinterpret_cast<uptr>(p));
+  uptr shadow_first = MemToShadow(ptr_raw);
+  uptr shadow_last = MemToShadow(ptr_raw + sz - 1);
+  Printf("HWASan shadow map for %zx .. %zx (pointer tag %x)\n", ptr_raw,
+         ptr_raw + sz, GetTagFromPointer((uptr)p));
+  for (uptr s = shadow_first; s <= shadow_last; ++s)
+    Printf("  %zx: %x\n", ShadowToMem(s), *(tag_t *)s);
 }
 
 sptr __hwasan_test_shadow(const void *p, uptr sz) {
@@ -227,12 +305,12 @@
   tag_t ptr_tag = GetTagFromPointer((uptr)p);
   if (ptr_tag == 0)
     return -1;
-  uptr ptr_raw = GetAddressFromPointer((uptr)p);
-  uptr shadow_first = MEM_TO_SHADOW(ptr_raw);
-  uptr shadow_last = MEM_TO_SHADOW(ptr_raw + sz - 1);
+  uptr ptr_raw = UntagAddr(reinterpret_cast<uptr>(p));
+  uptr shadow_first = MemToShadow(ptr_raw);
+  uptr shadow_last = MemToShadow(ptr_raw + sz - 1);
   for (uptr s = shadow_first; s <= shadow_last; ++s)
     if (*(tag_t*)s != ptr_tag)
-      return SHADOW_TO_MEM(s) - ptr_raw;
+      return ShadowToMem(s) - ptr_raw;
   return -1;
 }
 
@@ -288,7 +366,7 @@
 __attribute__((always_inline, nodebug)) static void CheckAddress(uptr p) {
   tag_t ptr_tag = GetTagFromPointer(p);
   uptr ptr_raw = p & ~kAddressTagMask;
-  tag_t mem_tag = *(tag_t *)MEM_TO_SHADOW(ptr_raw);
+  tag_t mem_tag = *(tag_t *)MemToShadow(ptr_raw);
   if (UNLIKELY(ptr_tag != mem_tag)) {
     SigTrap<0x20 * (EA == ErrorAction::Recover) +
            0x10 * (AT == AccessType::Store) + LogSize>(p);
@@ -302,8 +380,8 @@
   CHECK_NE(0, sz);
   tag_t ptr_tag = GetTagFromPointer(p);
   uptr ptr_raw = p & ~kAddressTagMask;
-  tag_t *shadow_first = (tag_t *)MEM_TO_SHADOW(ptr_raw);
-  tag_t *shadow_last = (tag_t *)MEM_TO_SHADOW(ptr_raw + sz - 1);
+  tag_t *shadow_first = (tag_t *)MemToShadow(ptr_raw);
+  tag_t *shadow_last = (tag_t *)MemToShadow(ptr_raw + sz - 1);
   for (tag_t *t = shadow_first; t <= shadow_last; ++t)
     if (UNLIKELY(ptr_tag != *t)) {
       SigTrap<0x20 * (EA == ErrorAction::Recover) +
@@ -392,10 +470,38 @@
   TagMemoryAligned(p, sz, tag);
 }
 
+uptr __hwasan_tag_pointer(uptr p, u8 tag) {
+  return AddTagToPointer(p, tag);
+}
+
+void __hwasan_handle_longjmp(const void *sp_dst) {
+  uptr dst = (uptr)sp_dst;
+  // HWASan does not support tagged SP.
+  CHECK(GetTagFromPointer(dst) == 0);
+
+  uptr sp = (uptr)__builtin_frame_address(0);
+  static const uptr kMaxExpectedCleanupSize = 64 << 20;  // 64M
+  if (dst < sp || dst - sp > kMaxExpectedCleanupSize) {
+    Report(
+        "WARNING: HWASan is ignoring requested __hwasan_handle_longjmp: "
+        "stack top: %p; target %p; distance: %p (%zd)\n"
+        "False positive error reports may follow\n",
+        (void *)sp, (void *)dst, dst - sp);
+    return;
+  }
+  TagMemory(sp, dst - sp, 0);
+}
+
+void __hwasan_print_memory_usage() {
+  InternalScopedString s(kMemoryUsageBufferSize);
+  HwasanFormatMemoryUsage(s);
+  Printf("%s\n", s.data());
+}
+
 static const u8 kFallbackTag = 0xBB;
 
 u8 __hwasan_generate_tag() {
-  HwasanThread *t = GetCurrentThread();
+  Thread *t = GetCurrentThread();
   if (!t) return kFallbackTag;
   return t->GenerateRandomTag();
 }
diff --git a/lib/hwasan/hwasan.h b/lib/hwasan/hwasan.h
index 47d1d05..a3da09e 100644
--- a/lib/hwasan/hwasan.h
+++ b/lib/hwasan/hwasan.h
@@ -30,6 +30,10 @@
 # define HWASAN_CONTAINS_UBSAN CAN_SANITIZE_UB
 #endif
 
+#ifndef HWASAN_WITH_INTERCEPTORS
+#define HWASAN_WITH_INTERCEPTORS 0
+#endif
+
 typedef u8 tag_t;
 
 // TBI (Top Byte Ignore) feature of AArch64: bits [63:56] are ignored in address
@@ -37,16 +41,21 @@
 const unsigned kAddressTagShift = 56;
 const uptr kAddressTagMask = 0xFFUL << kAddressTagShift;
 
+// Minimal alignment of the shadow base address. Determines the space available
+// for threads and stack histories. This is an ABI constant.
+const unsigned kShadowBaseAlignment = 32;
+
 static inline tag_t GetTagFromPointer(uptr p) {
   return p >> kAddressTagShift;
 }
 
-static inline uptr GetAddressFromPointer(uptr p) {
-  return p & ~kAddressTagMask;
+static inline uptr UntagAddr(uptr tagged_addr) {
+  return tagged_addr & ~kAddressTagMask;
 }
 
-static inline void * GetAddressFromPointer(const void *p) {
-  return (void *)((uptr)p & ~kAddressTagMask);
+static inline void *UntagPtr(const void *tagged_ptr) {
+  return reinterpret_cast<void *>(
+      UntagAddr(reinterpret_cast<uptr>(tagged_ptr)));
 }
 
 static inline uptr AddTagToPointer(uptr p, tag_t tag) {
@@ -61,6 +70,8 @@
 
 bool ProtectRange(uptr beg, uptr end);
 bool InitShadow();
+void InitThreads();
+void MadviseShadow();
 char *GetProcSelfMaps();
 void InitializeInterceptors();
 
@@ -135,13 +146,13 @@
   u64 va_arg_overflow_size_tls;
 };
 
-void HwasanTSDInit(void (*destructor)(void *tsd));
-void *HwasanTSDGet();
-void HwasanTSDSet(void *tsd);
-void HwasanTSDDtor(void *tsd);
+void HwasanTSDInit();
+void HwasanTSDThreadInit();
 
 void HwasanOnDeadlySignal(int signo, void *info, void *context);
 
+void UpdateMemoryUsage();
+
 }  // namespace __hwasan
 
 #define HWASAN_MALLOC_HOOK(ptr, size)       \
diff --git a/lib/hwasan/hwasan_allocator.cc b/lib/hwasan/hwasan_allocator.cc
index c2b9b0b..b9c379e 100644
--- a/lib/hwasan/hwasan_allocator.cc
+++ b/lib/hwasan/hwasan_allocator.cc
@@ -12,10 +12,6 @@
 // HWAddressSanitizer allocator.
 //===----------------------------------------------------------------------===//
 
-#include "sanitizer_common/sanitizer_allocator.h"
-#include "sanitizer_common/sanitizer_allocator_checks.h"
-#include "sanitizer_common/sanitizer_allocator_interface.h"
-#include "sanitizer_common/sanitizer_allocator_report.h"
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_errno.h"
 #include "sanitizer_common/sanitizer_stackdepot.h"
@@ -23,29 +19,14 @@
 #include "hwasan_allocator.h"
 #include "hwasan_mapping.h"
 #include "hwasan_thread.h"
-#include "hwasan_poisoning.h"
+#include "hwasan_report.h"
 
 namespace __hwasan {
 
-enum {
-  CHUNK_INVALID = 0,
-  CHUNK_FREE = 1,
-  CHUNK_ALLOCATED = 2
-};
-
-struct Metadata {
-  u64 state : 2;
-  u64 requested_size : 62;
-  u32 alloc_context_id;
-  u32 free_context_id;
-};
-
-bool HwasanChunkView::IsValid() const {
-  return metadata_ && metadata_->state != CHUNK_INVALID;
-}
 bool HwasanChunkView::IsAllocated() const {
-  return metadata_ && metadata_->state == CHUNK_ALLOCATED;
+  return metadata_ && metadata_->alloc_context_id && metadata_->requested_size;
 }
+
 uptr HwasanChunkView::Beg() const {
   return block_;
 }
@@ -58,44 +39,6 @@
 u32 HwasanChunkView::GetAllocStackId() const {
   return metadata_->alloc_context_id;
 }
-u32 HwasanChunkView::GetFreeStackId() const {
-  return metadata_->free_context_id;
-}
-
-struct HwasanMapUnmapCallback {
-  void OnMap(uptr p, uptr size) const {}
-  void OnUnmap(uptr p, uptr size) const {
-    // We are about to unmap a chunk of user memory.
-    // It can return as user-requested mmap() or another thread stack.
-    // Make it accessible with zero-tagged pointer.
-    TagMemory(p, size, 0);
-  }
-};
-
-#if !defined(__aarch64__) && !defined(__x86_64__)
-#error Unsupported platform
-#endif
-
-static const uptr kMaxAllowedMallocSize = 2UL << 30;  // 2G
-static const uptr kRegionSizeLog = 20;
-static const uptr kNumRegions = SANITIZER_MMAP_RANGE_SIZE >> kRegionSizeLog;
-typedef TwoLevelByteMap<(kNumRegions >> 12), 1 << 12> ByteMap;
-
-struct AP32 {
-  static const uptr kSpaceBeg = 0;
-  static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
-  static const uptr kMetadataSize = sizeof(Metadata);
-  typedef __sanitizer::CompactSizeClassMap SizeClassMap;
-  static const uptr kRegionSizeLog = __hwasan::kRegionSizeLog;
-  typedef __hwasan::ByteMap ByteMap;
-  typedef HwasanMapUnmapCallback MapUnmapCallback;
-  static const uptr kFlags = 0;
-};
-typedef SizeClassAllocator32<AP32> PrimaryAllocator;
-typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
-typedef LargeMmapAllocator<HwasanMapUnmapCallback> SecondaryAllocator;
-typedef CombinedAllocator<PrimaryAllocator, AllocatorCache,
-                          SecondaryAllocator> Allocator;
 
 static Allocator allocator;
 static AllocatorCache fallback_allocator_cache;
@@ -105,6 +48,10 @@
 static const tag_t kFallbackAllocTag = 0xBB;
 static const tag_t kFallbackFreeTag = 0xBC;
 
+void GetAllocatorStats(AllocatorStatCounters s) {
+  allocator.GetStats(s);
+}
+
 void HwasanAllocatorInit() {
   atomic_store_relaxed(&hwasan_allocator_tagging_enabled,
                        !flags()->disable_allocator_tagging);
@@ -112,34 +59,34 @@
   allocator.Init(common_flags()->allocator_release_to_os_interval_ms);
 }
 
-AllocatorCache *GetAllocatorCache(HwasanThreadLocalMallocStorage *ms) {
-  CHECK(ms);
-  CHECK_LE(sizeof(AllocatorCache), sizeof(ms->allocator_cache));
-  return reinterpret_cast<AllocatorCache *>(ms->allocator_cache);
+void AllocatorSwallowThreadLocalCache(AllocatorCache *cache) {
+  allocator.SwallowCache(cache);
 }
 
-void HwasanThreadLocalMallocStorage::CommitBack() {
-  allocator.SwallowCache(GetAllocatorCache(this));
+static uptr TaggedSize(uptr size) {
+  if (!size) size = 1;
+  uptr new_size = RoundUpTo(size, kShadowAlignment);
+  CHECK_GE(new_size, size);
+  return new_size;
 }
 
-static void *HwasanAllocate(StackTrace *stack, uptr size, uptr alignment,
-                          bool zeroise) {
-  alignment = Max(alignment, kShadowAlignment);
-  size = RoundUpTo(size, kShadowAlignment);
-
-  if (size > kMaxAllowedMallocSize) {
+static void *HwasanAllocate(StackTrace *stack, uptr orig_size, uptr alignment,
+                            bool zeroise) {
+  if (orig_size > kMaxAllowedMallocSize) {
     if (AllocatorMayReturnNull()) {
       Report("WARNING: HWAddressSanitizer failed to allocate 0x%zx bytes\n",
-             size);
+             orig_size);
       return nullptr;
     }
-    ReportAllocationSizeTooBig(size, kMaxAllowedMallocSize, stack);
+    ReportAllocationSizeTooBig(orig_size, kMaxAllowedMallocSize, stack);
   }
-  HwasanThread *t = GetCurrentThread();
+
+  alignment = Max(alignment, kShadowAlignment);
+  uptr size = TaggedSize(orig_size);
+  Thread *t = GetCurrentThread();
   void *allocated;
   if (t) {
-    AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage());
-    allocated = allocator.Allocate(cache, size, alignment);
+    allocated = allocator.Allocate(t->allocator_cache(), size, alignment);
   } else {
     SpinMutexLock l(&fallback_mutex);
     AllocatorCache *cache = &fallback_allocator_cache;
@@ -153,11 +100,14 @@
   }
   Metadata *meta =
       reinterpret_cast<Metadata *>(allocator.GetMetaData(allocated));
-  meta->state = CHUNK_ALLOCATED;
-  meta->requested_size = size;
+  meta->requested_size = static_cast<u32>(orig_size);
   meta->alloc_context_id = StackDepotPut(*stack);
-  if (zeroise)
+  if (zeroise) {
     internal_memset(allocated, 0, size);
+  } else if (flags()->max_malloc_fill_size > 0) {
+    uptr fill_size = Min(size, (uptr)flags()->max_malloc_fill_size);
+    internal_memset(allocated, flags()->malloc_fill_byte, fill_size);
+  }
 
   void *user_ptr = allocated;
   if (flags()->tag_in_malloc &&
@@ -169,67 +119,68 @@
   return user_ptr;
 }
 
-void HwasanDeallocate(StackTrace *stack, void *user_ptr) {
-  CHECK(user_ptr);
-  HWASAN_FREE_HOOK(user_ptr);
+static bool PointerAndMemoryTagsMatch(void *tagged_ptr) {
+  CHECK(tagged_ptr);
+  tag_t ptr_tag = GetTagFromPointer(reinterpret_cast<uptr>(tagged_ptr));
+  tag_t mem_tag = *reinterpret_cast<tag_t *>(
+      MemToShadow(reinterpret_cast<uptr>(UntagPtr(tagged_ptr))));
+  return ptr_tag == mem_tag;
+}
 
-  void *p = GetAddressFromPointer(user_ptr);
-  Metadata *meta = reinterpret_cast<Metadata *>(allocator.GetMetaData(p));
-  uptr size = meta->requested_size;
-  meta->state = CHUNK_FREE;
+void HwasanDeallocate(StackTrace *stack, void *tagged_ptr) {
+  CHECK(tagged_ptr);
+  HWASAN_FREE_HOOK(tagged_ptr);
+
+  if (!PointerAndMemoryTagsMatch(tagged_ptr))
+    ReportInvalidFree(stack, reinterpret_cast<uptr>(tagged_ptr));
+
+  void *untagged_ptr = UntagPtr(tagged_ptr);
+  Metadata *meta =
+      reinterpret_cast<Metadata *>(allocator.GetMetaData(untagged_ptr));
+  uptr orig_size = meta->requested_size;
+  u32 free_context_id = StackDepotPut(*stack);
+  u32 alloc_context_id = meta->alloc_context_id;
   meta->requested_size = 0;
-  meta->free_context_id = StackDepotPut(*stack);
+  meta->alloc_context_id = 0;
   // This memory will not be reused by anyone else, so we are free to keep it
   // poisoned.
-  HwasanThread *t = GetCurrentThread();
+  Thread *t = GetCurrentThread();
+  if (flags()->max_free_fill_size > 0) {
+    uptr fill_size = Min(orig_size, (uptr)flags()->max_free_fill_size);
+    internal_memset(untagged_ptr, flags()->free_fill_byte, fill_size);
+  }
   if (flags()->tag_in_free &&
       atomic_load_relaxed(&hwasan_allocator_tagging_enabled))
-    TagMemoryAligned((uptr)p, size,
+    TagMemoryAligned((uptr)untagged_ptr, TaggedSize(orig_size),
                      t ? t->GenerateRandomTag() : kFallbackFreeTag);
   if (t) {
-    AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage());
-    allocator.Deallocate(cache, p);
+    allocator.Deallocate(t->allocator_cache(), untagged_ptr);
+    if (auto *ha = t->heap_allocations())
+      ha->push({reinterpret_cast<uptr>(tagged_ptr), alloc_context_id,
+                free_context_id, static_cast<u32>(orig_size)});
   } else {
     SpinMutexLock l(&fallback_mutex);
     AllocatorCache *cache = &fallback_allocator_cache;
-    allocator.Deallocate(cache, p);
+    allocator.Deallocate(cache, untagged_ptr);
   }
 }
 
-void *HwasanReallocate(StackTrace *stack, void *user_old_p, uptr new_size,
+void *HwasanReallocate(StackTrace *stack, void *tagged_ptr_old, uptr new_size,
                      uptr alignment) {
-  alignment = Max(alignment, kShadowAlignment);
-  new_size = RoundUpTo(new_size, kShadowAlignment);
+  if (!PointerAndMemoryTagsMatch(tagged_ptr_old))
+    ReportInvalidFree(stack, reinterpret_cast<uptr>(tagged_ptr_old));
 
-  void *old_p = GetAddressFromPointer(user_old_p);
-  Metadata *meta = reinterpret_cast<Metadata*>(allocator.GetMetaData(old_p));
-  uptr old_size = meta->requested_size;
-  uptr actually_allocated_size = allocator.GetActuallyAllocatedSize(old_p);
-  if (new_size <= actually_allocated_size) {
-    // We are not reallocating here.
-    // FIXME: update stack trace for the allocation?
-    meta->requested_size = new_size;
-    if (!atomic_load_relaxed(&hwasan_allocator_tagging_enabled))
-      return user_old_p;
-    if (flags()->retag_in_realloc) {
-      HwasanThread *t = GetCurrentThread();
-      return (void *)TagMemoryAligned(
-          (uptr)old_p, new_size,
-          t ? t->GenerateRandomTag() : kFallbackAllocTag);
-    }
-    if (new_size > old_size) {
-      tag_t tag = GetTagFromPointer((uptr)user_old_p);
-      TagMemoryAligned((uptr)old_p + old_size, new_size - old_size, tag);
-    }
-    return user_old_p;
+  void *tagged_ptr_new =
+      HwasanAllocate(stack, new_size, alignment, false /*zeroise*/);
+  if (tagged_ptr_old && tagged_ptr_new) {
+    void *untagged_ptr_old =  UntagPtr(tagged_ptr_old);
+    Metadata *meta =
+        reinterpret_cast<Metadata *>(allocator.GetMetaData(untagged_ptr_old));
+    internal_memcpy(UntagPtr(tagged_ptr_new), untagged_ptr_old,
+                    Min(new_size, static_cast<uptr>(meta->requested_size)));
+    HwasanDeallocate(stack, tagged_ptr_old);
   }
-  uptr memcpy_size = Min(new_size, old_size);
-  void *new_p = HwasanAllocate(stack, new_size, alignment, false /*zeroise*/);
-  if (new_p) {
-    internal_memcpy(new_p, old_p, memcpy_size);
-    HwasanDeallocate(stack, old_p);
-  }
-  return new_p;
+  return tagged_ptr_new;
 }
 
 void *HwasanCalloc(StackTrace *stack, uptr nmemb, uptr size) {
@@ -250,12 +201,12 @@
   return HwasanChunkView(reinterpret_cast<uptr>(block), metadata);
 }
 
-static uptr AllocationSize(const void *user_ptr) {
-  const void *p = GetAddressFromPointer(user_ptr);
-  if (!p) return 0;
-  const void *beg = allocator.GetBlockBegin(p);
-  if (beg != p) return 0;
-  Metadata *b = (Metadata *)allocator.GetMetaData(p);
+static uptr AllocationSize(const void *tagged_ptr) {
+  const void *untagged_ptr = UntagPtr(tagged_ptr);
+  if (!untagged_ptr) return 0;
+  const void *beg = allocator.GetBlockBegin(untagged_ptr);
+  if (beg != untagged_ptr) return 0;
+  Metadata *b = (Metadata *)allocator.GetMetaData(untagged_ptr);
   return b->requested_size;
 }
 
diff --git a/lib/hwasan/hwasan_allocator.h b/lib/hwasan/hwasan_allocator.h
index d025112..3589212 100644
--- a/lib/hwasan/hwasan_allocator.h
+++ b/lib/hwasan/hwasan_allocator.h
@@ -1,4 +1,4 @@
-//===-- hwasan_allocator.h ----------------------------------------*- C++ -*-===//
+//===-- hwasan_allocator.h --------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,35 +14,69 @@
 #ifndef HWASAN_ALLOCATOR_H
 #define HWASAN_ALLOCATOR_H
 
+#include "sanitizer_common/sanitizer_allocator.h"
+#include "sanitizer_common/sanitizer_allocator_checks.h"
+#include "sanitizer_common/sanitizer_allocator_interface.h"
+#include "sanitizer_common/sanitizer_allocator_report.h"
 #include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_ring_buffer.h"
+#include "hwasan_poisoning.h"
+
+#if !defined(__aarch64__) && !defined(__x86_64__)
+#error Unsupported platform
+#endif
 
 namespace __hwasan {
 
-struct HwasanThreadLocalMallocStorage {
-  uptr quarantine_cache[16];
-  // Allocator cache contains atomic_uint64_t which must be 8-byte aligned.
-  ALIGNED(8) uptr allocator_cache[96 * (512 * 8 + 16)];  // Opaque.
-  void CommitBack();
-
- private:
-  // These objects are allocated via mmap() and are zero-initialized.
-  HwasanThreadLocalMallocStorage() {}
+struct Metadata {
+  u32 requested_size;  // sizes are < 4G.
+  u32 alloc_context_id;
 };
 
-struct Metadata;
+struct HwasanMapUnmapCallback {
+  void OnMap(uptr p, uptr size) const { UpdateMemoryUsage(); }
+  void OnUnmap(uptr p, uptr size) const {
+    // We are about to unmap a chunk of user memory.
+    // It can return as user-requested mmap() or another thread stack.
+    // Make it accessible with zero-tagged pointer.
+    TagMemory(p, size, 0);
+  }
+};
+
+static const uptr kMaxAllowedMallocSize = 2UL << 30;  // 2G
+static const uptr kRegionSizeLog = 20;
+static const uptr kNumRegions = SANITIZER_MMAP_RANGE_SIZE >> kRegionSizeLog;
+typedef TwoLevelByteMap<(kNumRegions >> 12), 1 << 12> ByteMap;
+
+struct AP32 {
+  static const uptr kSpaceBeg = 0;
+  static const u64 kSpaceSize = SANITIZER_MMAP_RANGE_SIZE;
+  static const uptr kMetadataSize = sizeof(Metadata);
+  typedef __sanitizer::CompactSizeClassMap SizeClassMap;
+  static const uptr kRegionSizeLog = __hwasan::kRegionSizeLog;
+  typedef __hwasan::ByteMap ByteMap;
+  typedef HwasanMapUnmapCallback MapUnmapCallback;
+  static const uptr kFlags = 0;
+};
+typedef SizeClassAllocator32<AP32> PrimaryAllocator;
+typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
+typedef LargeMmapAllocator<HwasanMapUnmapCallback> SecondaryAllocator;
+typedef CombinedAllocator<PrimaryAllocator, AllocatorCache,
+                          SecondaryAllocator> Allocator;
+
+
+void AllocatorSwallowThreadLocalCache(AllocatorCache *cache);
 
 class HwasanChunkView {
  public:
   HwasanChunkView() : block_(0), metadata_(nullptr) {}
   HwasanChunkView(uptr block, Metadata *metadata)
       : block_(block), metadata_(metadata) {}
-  bool IsValid() const;        // Checks if it points to a valid allocated chunk
   bool IsAllocated() const;    // Checks if the memory is currently allocated
   uptr Beg() const;            // First byte of user memory
   uptr End() const;            // Last byte of user memory
   uptr UsedSize() const;       // Size requested by the user
   u32 GetAllocStackId() const;
-  u32 GetFreeStackId() const;
  private:
   uptr block_;
   Metadata *const metadata_;
@@ -50,6 +84,21 @@
 
 HwasanChunkView FindHeapChunkByAddress(uptr address);
 
+// Information about one (de)allocation that happened in the past.
+// These are recorded in a thread-local ring buffer.
+// TODO: this is currently 24 bytes (20 bytes + alignment).
+// Compress it to 16 bytes or extend it to be more useful.
+struct HeapAllocationRecord {
+  uptr tagged_addr;
+  u32  alloc_context_id;
+  u32  free_context_id;
+  u32  requested_size;
+};
+
+typedef RingBuffer<HeapAllocationRecord> HeapAllocationsRingBuffer;
+
+void GetAllocatorStats(AllocatorStatCounters s);
+
 } // namespace __hwasan
 
 #endif // HWASAN_ALLOCATOR_H
diff --git a/lib/hwasan/hwasan_dynamic_shadow.cc b/lib/hwasan/hwasan_dynamic_shadow.cc
index 1733800..87670f5 100644
--- a/lib/hwasan/hwasan_dynamic_shadow.cc
+++ b/lib/hwasan/hwasan_dynamic_shadow.cc
@@ -13,6 +13,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "hwasan.h"
 #include "hwasan_dynamic_shadow.h"
 #include "hwasan_mapping.h"
 #include "sanitizer_common/sanitizer_common.h"
@@ -35,12 +36,16 @@
   }
 }
 
-// Returns an address aligned to 8 pages, such that one page on the left and
-// shadow_size_bytes bytes on the right of it are mapped r/o.
+// Returns an address aligned to kShadowBaseAlignment, such that
+// 2**kShadowBaseAlingment on the left and shadow_size_bytes bytes on the right
+// of it are mapped no access.
 static uptr MapDynamicShadow(uptr shadow_size_bytes) {
   const uptr granularity = GetMmapGranularity();
-  const uptr alignment = granularity * SHADOW_GRANULARITY;
-  const uptr left_padding = granularity;
+  const uptr min_alignment = granularity << kShadowScale;
+  const uptr alignment = 1ULL << kShadowBaseAlignment;
+  CHECK_GE(alignment, min_alignment);
+
+  const uptr left_padding = 1ULL << kShadowBaseAlignment;
   const uptr shadow_size =
       RoundUpTo(shadow_size_bytes, granularity);
   const uptr map_size = shadow_size + left_padding + alignment;
@@ -58,8 +63,7 @@
 
 }  // namespace __hwasan
 
-#if HWASAN_PREMAP_SHADOW
-
+#if SANITIZER_ANDROID
 extern "C" {
 
 INTERFACE_ATTRIBUTE void __hwasan_shadow();
@@ -117,16 +121,22 @@
 
 }  // extern "C"
 
-#endif  // HWASAN_PREMAP_SHADOW
-
 namespace __hwasan {
 
 uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
-#if HWASAN_PREMAP_SHADOW
   if (IsPremapShadowAvailable())
     return FindPremappedShadowStart(shadow_size_bytes);
-#endif
   return MapDynamicShadow(shadow_size_bytes);
 }
 
 }  // namespace __hwasan
+#else
+namespace __hwasan {
+
+uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
+  return MapDynamicShadow(shadow_size_bytes);
+}
+
+}  // namespace __hwasan
+#
+#endif  // SANITIZER_ANDROID
diff --git a/lib/hwasan/hwasan_flags.h b/lib/hwasan/hwasan_flags.h
index 16d60c4..492d5bb 100644
--- a/lib/hwasan/hwasan_flags.h
+++ b/lib/hwasan/hwasan_flags.h
@@ -1,4 +1,4 @@
-//===-- hwasan_flags.h --------------------------------------------*- C++ -*-===//
+//===-- hwasan_flags.h ------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/hwasan/hwasan_flags.inc b/lib/hwasan/hwasan_flags.inc
index c457811..f1b416d 100644
--- a/lib/hwasan/hwasan_flags.inc
+++ b/lib/hwasan/hwasan_flags.inc
@@ -17,9 +17,10 @@
 // HWASAN_FLAG(Type, Name, DefaultValue, Description)
 // See COMMON_FLAG in sanitizer_flags.inc for more details.
 
+HWASAN_FLAG(bool, verbose_threads, false,
+            "inform on thread creation/destruction")
 HWASAN_FLAG(bool, tag_in_malloc, true, "")
 HWASAN_FLAG(bool, tag_in_free, true, "")
-HWASAN_FLAG(bool, retag_in_realloc, true, "")
 HWASAN_FLAG(bool, print_stats, false, "")
 HWASAN_FLAG(bool, halt_on_error, true, "")
 HWASAN_FLAG(bool, atexit, false, "")
@@ -31,3 +32,26 @@
 // If false, use simple increment of a thread local counter to generate new
 // tags.
 HWASAN_FLAG(bool, random_tags, true, "")
+
+HWASAN_FLAG(
+    int, max_malloc_fill_size, 0x1000,  // By default, fill only the first 4K.
+    "HWASan allocator flag. max_malloc_fill_size is the maximal amount of "
+    "bytes that will be filled with malloc_fill_byte on malloc.")
+HWASAN_FLAG(
+    int, max_free_fill_size, 0,
+    "HWASan allocator flag. max_free_fill_size is the maximal amount of "
+    "bytes that will be filled with free_fill_byte during free.")
+HWASAN_FLAG(int, malloc_fill_byte, 0xbe,
+          "Value used to fill the newly allocated memory.")
+HWASAN_FLAG(int, free_fill_byte, 0x55,
+          "Value used to fill deallocated memory.")
+HWASAN_FLAG(int, heap_history_size, 1023,
+          "The number of heap (de)allocations remembered per thread. "
+          "Affects the quality of heap-related reports, but not the ability "
+          "to find bugs.")
+HWASAN_FLAG(bool, export_memory_stats, true,
+            "Export up-to-date memory stats through /proc")
+HWASAN_FLAG(int, stack_history_size, 1024,
+            "The number of stack frames remembered per thread. "
+            "Affects the quality of stack-related reports, but not the ability "
+            "to find bugs.")
diff --git a/lib/hwasan/hwasan_interceptors.cc b/lib/hwasan/hwasan_interceptors.cc
index 66aab95..9a0770f 100644
--- a/lib/hwasan/hwasan_interceptors.cc
+++ b/lib/hwasan/hwasan_interceptors.cc
@@ -1,4 +1,4 @@
-//===-- hwasan_interceptors.cc ----------------------------------------------===//
+//===-- hwasan_interceptors.cc --------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -50,18 +50,18 @@
 DECLARE_REAL(void *, memset, void *dest, int c, uptr n)
 
 bool IsInInterceptorScope() {
-  HwasanThread *t = GetCurrentThread();
+  Thread *t = GetCurrentThread();
   return t && t->InInterceptorScope();
 }
 
 struct InterceptorScope {
   InterceptorScope() {
-    HwasanThread *t = GetCurrentThread();
+    Thread *t = GetCurrentThread();
     if (t)
       t->EnterInterceptorScope();
   }
   ~InterceptorScope() {
-    HwasanThread *t = GetCurrentThread();
+    Thread *t = GetCurrentThread();
     if (t)
       t->LeaveInterceptorScope();
   }
@@ -128,30 +128,24 @@
   CHECK_UNPOISONED((x),                                         \
     common_flags()->strict_string_checks ? (len) + 1 : (n) )
 
-
-INTERCEPTOR(int, posix_memalign, void **memptr, SIZE_T alignment, SIZE_T size) {
+int __sanitizer_posix_memalign(void **memptr, uptr alignment, uptr size) {
   GET_MALLOC_STACK_TRACE;
   CHECK_NE(memptr, 0);
   int res = hwasan_posix_memalign(memptr, alignment, size, &stack);
   return res;
 }
 
-#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
-INTERCEPTOR(void *, memalign, SIZE_T alignment, SIZE_T size) {
+void * __sanitizer_memalign(uptr alignment, uptr size) {
   GET_MALLOC_STACK_TRACE;
   return hwasan_memalign(alignment, size, &stack);
 }
-#define HWASAN_MAYBE_INTERCEPT_MEMALIGN INTERCEPT_FUNCTION(memalign)
-#else
-#define HWASAN_MAYBE_INTERCEPT_MEMALIGN
-#endif
 
-INTERCEPTOR(void *, aligned_alloc, SIZE_T alignment, SIZE_T size) {
+void * __sanitizer_aligned_alloc(uptr alignment, uptr size) {
   GET_MALLOC_STACK_TRACE;
   return hwasan_aligned_alloc(alignment, size, &stack);
 }
 
-INTERCEPTOR(void *, __libc_memalign, SIZE_T alignment, SIZE_T size) {
+void * __sanitizer___libc_memalign(uptr alignment, uptr size) {
   GET_MALLOC_STACK_TRACE;
   void *ptr = hwasan_memalign(alignment, size, &stack);
   if (ptr)
@@ -159,80 +153,47 @@
   return ptr;
 }
 
-INTERCEPTOR(void *, valloc, SIZE_T size) {
+void * __sanitizer_valloc(uptr size) {
   GET_MALLOC_STACK_TRACE;
   return hwasan_valloc(size, &stack);
 }
 
-#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
-INTERCEPTOR(void *, pvalloc, SIZE_T size) {
+void * __sanitizer_pvalloc(uptr size) {
   GET_MALLOC_STACK_TRACE;
   return hwasan_pvalloc(size, &stack);
 }
-#define HWASAN_MAYBE_INTERCEPT_PVALLOC INTERCEPT_FUNCTION(pvalloc)
-#else
-#define HWASAN_MAYBE_INTERCEPT_PVALLOC
-#endif
 
-INTERCEPTOR(void, free, void *ptr) {
+void __sanitizer_free(void *ptr) {
   GET_MALLOC_STACK_TRACE;
   if (!ptr || UNLIKELY(IsInDlsymAllocPool(ptr))) return;
   HwasanDeallocate(&stack, ptr);
 }
 
-#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
-INTERCEPTOR(void, cfree, void *ptr) {
+void __sanitizer_cfree(void *ptr) {
   GET_MALLOC_STACK_TRACE;
   if (!ptr || UNLIKELY(IsInDlsymAllocPool(ptr))) return;
   HwasanDeallocate(&stack, ptr);
 }
-#define HWASAN_MAYBE_INTERCEPT_CFREE INTERCEPT_FUNCTION(cfree)
-#else
-#define HWASAN_MAYBE_INTERCEPT_CFREE
-#endif
 
-INTERCEPTOR(uptr, malloc_usable_size, void *ptr) {
+uptr __sanitizer_malloc_usable_size(const void *ptr) {
   return __sanitizer_get_allocated_size(ptr);
 }
 
-#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
-// This function actually returns a struct by value, but we can't unpoison a
-// temporary! The following is equivalent on all supported platforms but
-// aarch64 (which uses a different register for sret value).  We have a test
-// to confirm that.
-INTERCEPTOR(void, mallinfo, __sanitizer_mallinfo *sret) {
-#ifdef __aarch64__
-  uptr r8;
-  asm volatile("mov %0,x8" : "=r" (r8));
-  sret = reinterpret_cast<__sanitizer_mallinfo*>(r8);
-#endif
-  REAL(memset)(sret, 0, sizeof(*sret));
+struct __sanitizer_struct_mallinfo __sanitizer_mallinfo() {
+  __sanitizer_struct_mallinfo sret;
+  internal_memset(&sret, 0, sizeof(sret));
+  return sret;
 }
-#define HWASAN_MAYBE_INTERCEPT_MALLINFO INTERCEPT_FUNCTION(mallinfo)
-#else
-#define HWASAN_MAYBE_INTERCEPT_MALLINFO
-#endif
 
-#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
-INTERCEPTOR(int, mallopt, int cmd, int value) {
+int __sanitizer_mallopt(int cmd, int value) {
   return -1;
 }
-#define HWASAN_MAYBE_INTERCEPT_MALLOPT INTERCEPT_FUNCTION(mallopt)
-#else
-#define HWASAN_MAYBE_INTERCEPT_MALLOPT
-#endif
 
-#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
-INTERCEPTOR(void, malloc_stats, void) {
+void __sanitizer_malloc_stats(void) {
   // FIXME: implement, but don't call REAL(malloc_stats)!
 }
-#define HWASAN_MAYBE_INTERCEPT_MALLOC_STATS INTERCEPT_FUNCTION(malloc_stats)
-#else
-#define HWASAN_MAYBE_INTERCEPT_MALLOC_STATS
-#endif
 
-
-INTERCEPTOR(void *, calloc, SIZE_T nmemb, SIZE_T size) {
+void * __sanitizer_calloc(uptr nmemb, uptr size) {
   GET_MALLOC_STACK_TRACE;
   if (UNLIKELY(!hwasan_inited))
     // Hack: dlsym calls calloc before REAL(calloc) is retrieved from dlsym.
@@ -240,7 +201,7 @@
   return hwasan_calloc(nmemb, size, &stack);
 }
 
-INTERCEPTOR(void *, realloc, void *ptr, SIZE_T size) {
+void * __sanitizer_realloc(void *ptr, uptr size) {
   GET_MALLOC_STACK_TRACE;
   if (UNLIKELY(IsInDlsymAllocPool(ptr))) {
     uptr offset = (uptr)ptr - (uptr)alloc_memory_for_dlsym;
@@ -258,7 +219,7 @@
   return hwasan_realloc(ptr, size, &stack);
 }
 
-INTERCEPTOR(void *, malloc, SIZE_T size) {
+void * __sanitizer_malloc(uptr size) {
   GET_MALLOC_STACK_TRACE;
   if (UNLIKELY(!hwasan_init_is_running))
     ENSURE_HWASAN_INITED();
@@ -268,48 +229,62 @@
   return hwasan_malloc(size, &stack);
 }
 
-template <class Mmap>
-static void *mmap_interceptor(Mmap real_mmap, void *addr, SIZE_T sz, int prot,
-                              int flags, int fd, OFF64_T off) {
-  if (addr && !MEM_IS_APP(addr)) {
-    if (flags & map_fixed) {
-      errno = errno_EINVAL;
-      return (void *)-1;
-    } else {
-      addr = nullptr;
-    }
-  }
-  return real_mmap(addr, sz, prot, flags, fd, off);
-}
+#if HWASAN_WITH_INTERCEPTORS
+#define INTERCEPTOR_ALIAS(RET, FN, ARGS...)                                  \
+  extern "C" SANITIZER_INTERFACE_ATTRIBUTE RET WRAP(FN)(ARGS)                \
+      ALIAS("__sanitizer_" #FN);                                             \
+  extern "C" SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE RET FN(  \
+      ARGS) ALIAS("__sanitizer_" #FN)
 
+INTERCEPTOR_ALIAS(int, posix_memalign, void **memptr, SIZE_T alignment,
+                  SIZE_T size);
+INTERCEPTOR_ALIAS(void *, aligned_alloc, SIZE_T alignment, SIZE_T size);
+INTERCEPTOR_ALIAS(void *, __libc_memalign, SIZE_T alignment, SIZE_T size);
+INTERCEPTOR_ALIAS(void *, valloc, SIZE_T size);
+INTERCEPTOR_ALIAS(void, free, void *ptr);
+INTERCEPTOR_ALIAS(uptr, malloc_usable_size, const void *ptr);
+INTERCEPTOR_ALIAS(void *, calloc, SIZE_T nmemb, SIZE_T size);
+INTERCEPTOR_ALIAS(void *, realloc, void *ptr, SIZE_T size);
+INTERCEPTOR_ALIAS(void *, malloc, SIZE_T size);
+
+#if !SANITIZER_FREEBSD && !SANITIZER_NETBSD
+INTERCEPTOR_ALIAS(void *, memalign, SIZE_T alignment, SIZE_T size);
+INTERCEPTOR_ALIAS(void *, pvalloc, SIZE_T size);
+INTERCEPTOR_ALIAS(void, cfree, void *ptr);
+INTERCEPTOR_ALIAS(__sanitizer_struct_mallinfo, mallinfo);
+INTERCEPTOR_ALIAS(int, mallopt, int cmd, int value);
+INTERCEPTOR_ALIAS(void, malloc_stats, void);
+#endif
+#endif // HWASAN_WITH_INTERCEPTORS
+
+
+#if HWASAN_WITH_INTERCEPTORS
 extern "C" int pthread_attr_init(void *attr);
 extern "C" int pthread_attr_destroy(void *attr);
 
+struct ThreadStartArg {
+  thread_callback_t callback;
+  void *param;
+};
+
 static void *HwasanThreadStartFunc(void *arg) {
-  HwasanThread *t = (HwasanThread *)arg;
-  SetCurrentThread(t);
-  return t->ThreadStart();
+  __hwasan_thread_enter();
+  ThreadStartArg A = *reinterpret_cast<ThreadStartArg*>(arg);
+  UnmapOrDie(arg, GetPageSizeCached());
+  return A.callback(A.param);
 }
 
 INTERCEPTOR(int, pthread_create, void *th, void *attr, void *(*callback)(void*),
             void * param) {
-  ENSURE_HWASAN_INITED(); // for GetTlsSize()
-  __sanitizer_pthread_attr_t myattr;
-  if (!attr) {
-    pthread_attr_init(&myattr);
-    attr = &myattr;
-  }
-
-  AdjustStackSize(attr);
-
-  HwasanThread *t = HwasanThread::Create(callback, param);
-
-  int res = REAL(pthread_create)(th, attr, HwasanThreadStartFunc, t);
-
-  if (attr == &myattr)
-    pthread_attr_destroy(&myattr);
+  ScopedTaggingDisabler disabler;
+  ThreadStartArg *A = reinterpret_cast<ThreadStartArg *> (MmapOrDie(
+      GetPageSizeCached(), "pthread_create"));
+  *A = {callback, param};
+  int res = REAL(pthread_create)(UntagPtr(th), UntagPtr(attr),
+                                 &HwasanThreadStartFunc, A);
   return res;
 }
+#endif // HWASAN_WITH_INTERCEPTORS
 
 static void BeforeFork() {
   StackDepotLockAll();
@@ -341,138 +316,18 @@
 
 } // namespace __hwasan
 
-// A version of CHECK_UNPOISONED using a saved scope value. Used in common
-// interceptors.
-#define CHECK_UNPOISONED_CTX(ctx, x, n)                         \
-  do {                                                          \
-    if (!((HwasanInterceptorContext *)ctx)->in_interceptor_scope) \
-      CHECK_UNPOISONED_0(x, n);                                 \
-  } while (0)
-
-#define HWASAN_INTERCEPT_FUNC(name)                                       \
-  do {                                                                  \
-    if ((!INTERCEPT_FUNCTION(name) || !REAL(name)))                     \
-      VReport(1, "HWAddressSanitizer: failed to intercept '" #name "'\n"); \
-  } while (0)
-
-#define HWASAN_INTERCEPT_FUNC_VER(name, ver)                                    \
-  do {                                                                        \
-    if ((!INTERCEPT_FUNCTION_VER(name, ver) || !REAL(name)))                  \
-      VReport(                                                                \
-          1, "HWAddressSanitizer: failed to intercept '" #name "@@" #ver "'\n"); \
-  } while (0)
-
-#define COMMON_INTERCEPT_FUNCTION(name) HWASAN_INTERCEPT_FUNC(name)
-#define COMMON_INTERCEPT_FUNCTION_VER(name, ver)                          \
-  HWASAN_INTERCEPT_FUNC_VER(name, ver)
-#define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size) \
-  CHECK_UNPOISONED_CTX(ctx, ptr, size)
-#define COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, size) \
-  CHECK_UNPOISONED_CTX(ctx, ptr, size)
-#define COMMON_INTERCEPTOR_INITIALIZE_RANGE(ptr, size) \
-  HWASAN_WRITE_RANGE(ctx, ptr, size)
-#define COMMON_INTERCEPTOR_ENTER(ctx, func, ...)                  \
-  if (hwasan_init_is_running) return REAL(func)(__VA_ARGS__);       \
-  ENSURE_HWASAN_INITED();                                           \
-  HwasanInterceptorContext hwasan_ctx = {IsInInterceptorScope()};     \
-  ctx = (void *)&hwasan_ctx;                                        \
-  (void)ctx;                                                      \
-  InterceptorScope interceptor_scope;
-#define COMMON_INTERCEPTOR_DIR_ACQUIRE(ctx, path) \
-  do {                                            \
-  } while (false)
-#define COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd) \
-  do {                                         \
-  } while (false)
-#define COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd) \
-  do {                                         \
-  } while (false)
-#define COMMON_INTERCEPTOR_FD_SOCKET_ACCEPT(ctx, fd, newfd) \
-  do {                                                      \
-  } while (false)
-#define COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, name) \
-  do {                                                \
-  } while (false)  // FIXME
-#define COMMON_INTERCEPTOR_SET_PTHREAD_NAME(ctx, thread, name) \
-  do {                                                         \
-  } while (false)  // FIXME
-#define COMMON_INTERCEPTOR_BLOCK_REAL(name) REAL(name)
-#define COMMON_INTERCEPTOR_ON_EXIT(ctx) OnExit()
-
-#define COMMON_INTERCEPTOR_GET_TLS_RANGE(begin, end)                           \
-  if (HwasanThread *t = GetCurrentThread()) {                                    \
-    *begin = t->tls_begin();                                                   \
-    *end = t->tls_end();                                                       \
-  } else {                                                                     \
-    *begin = *end = 0;                                                         \
-  }
-
-#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, dst, v, size) \
-  {                                                       \
-    COMMON_INTERCEPTOR_ENTER(ctx, memset, dst, v, size);  \
-    if (common_flags()->intercept_intrin &&               \
-        MEM_IS_APP(GetAddressFromPointer(dst)))           \
-      COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size);     \
-    return REAL(memset)(dst, v, size);                    \
-  }
-
-#define COMMON_INTERCEPTOR_MMAP_IMPL(ctx, mmap, addr, length, prot, flags, fd, \
-                                     offset)                                   \
-  do {                                                                         \
-    return mmap_interceptor(REAL(mmap), addr, length, prot, flags, fd,         \
-                            offset);                                           \
-  } while (false)
-
-#include "sanitizer_common/sanitizer_platform_interceptors.h"
-#include "sanitizer_common/sanitizer_common_interceptors.inc"
-#include "sanitizer_common/sanitizer_signal_interceptors.inc"
-
-#define COMMON_SYSCALL_PRE_READ_RANGE(p, s) CHECK_UNPOISONED(p, s)
-#define COMMON_SYSCALL_PRE_WRITE_RANGE(p, s) \
-  do {                                       \
-    (void)(p);                               \
-    (void)(s);                               \
-  } while (false)
-#define COMMON_SYSCALL_POST_READ_RANGE(p, s) \
-  do {                                       \
-    (void)(p);                               \
-    (void)(s);                               \
-  } while (false)
-#define COMMON_SYSCALL_POST_WRITE_RANGE(p, s) \
-  do {                                        \
-    (void)(p);                                \
-    (void)(s);                                \
-  } while (false)
-#include "sanitizer_common/sanitizer_common_syscalls.inc"
-#include "sanitizer_common/sanitizer_syscalls_netbsd.inc"
-
-
-
 namespace __hwasan {
 
 void InitializeInterceptors() {
   static int inited = 0;
   CHECK_EQ(inited, 0);
-  InitializeCommonInterceptors();
-  InitializeSignalInterceptors();
 
-  INTERCEPT_FUNCTION(posix_memalign);
-  HWASAN_MAYBE_INTERCEPT_MEMALIGN;
-  INTERCEPT_FUNCTION(__libc_memalign);
-  INTERCEPT_FUNCTION(valloc);
-  HWASAN_MAYBE_INTERCEPT_PVALLOC;
-  INTERCEPT_FUNCTION(malloc);
-  INTERCEPT_FUNCTION(calloc);
-  INTERCEPT_FUNCTION(realloc);
-  INTERCEPT_FUNCTION(free);
-  HWASAN_MAYBE_INTERCEPT_CFREE;
-  INTERCEPT_FUNCTION(malloc_usable_size);
-  HWASAN_MAYBE_INTERCEPT_MALLINFO;
-  HWASAN_MAYBE_INTERCEPT_MALLOPT;
-  HWASAN_MAYBE_INTERCEPT_MALLOC_STATS;
-  INTERCEPT_FUNCTION(pthread_create);
   INTERCEPT_FUNCTION(fork);
 
+#if HWASAN_WITH_INTERCEPTORS
+  INTERCEPT_FUNCTION(pthread_create);
+#endif
+
   inited = 1;
 }
 } // namespace __hwasan
diff --git a/lib/hwasan/hwasan_interface_internal.h b/lib/hwasan/hwasan_interface_internal.h
index b4e5c80..448997e 100644
--- a/lib/hwasan/hwasan_interface_internal.h
+++ b/lib/hwasan/hwasan_interface_internal.h
@@ -1,4 +1,4 @@
-//===-- hwasan_interface_internal.h -------------------------------*- C++ -*-===//
+//===-- hwasan_interface_internal.h -----------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,10 +16,14 @@
 #define HWASAN_INTERFACE_INTERNAL_H
 
 #include "sanitizer_common/sanitizer_internal_defs.h"
+#include "sanitizer_common/sanitizer_platform_limits_posix.h"
 
 extern "C" {
 
 SANITIZER_INTERFACE_ATTRIBUTE
+void __hwasan_shadow_init();
+
+SANITIZER_INTERFACE_ATTRIBUTE
 void __hwasan_init();
 
 using __sanitizer::uptr;
@@ -91,6 +95,9 @@
 void __hwasan_tag_memory(uptr p, u8 tag, uptr sz);
 
 SANITIZER_INTERFACE_ATTRIBUTE
+uptr __hwasan_tag_pointer(uptr p, u8 tag);
+
+SANITIZER_INTERFACE_ATTRIBUTE
 u8 __hwasan_generate_tag();
 
 // Returns the offset of the first tag mismatch or -1 if the whole range is
@@ -105,6 +112,9 @@
 void __hwasan_print_shadow(const void *x, uptr size);
 
 SANITIZER_INTERFACE_ATTRIBUTE
+void __hwasan_handle_longjmp(const void *sp_dst);
+
+SANITIZER_INTERFACE_ATTRIBUTE
 u16 __sanitizer_unaligned_load16(const uu16 *p);
 
 SANITIZER_INTERFACE_ATTRIBUTE
@@ -128,6 +138,59 @@
 SANITIZER_INTERFACE_ATTRIBUTE
 void __hwasan_disable_allocator_tagging();
 
+SANITIZER_INTERFACE_ATTRIBUTE
+void __hwasan_thread_enter();
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __hwasan_thread_exit();
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __hwasan_print_memory_usage();
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __sanitizer_posix_memalign(void **memptr, uptr alignment, uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer_memalign(uptr alignment, uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer_aligned_alloc(uptr alignment, uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer___libc_memalign(uptr alignment, uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer_valloc(uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer_pvalloc(uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_free(void *ptr);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_cfree(void *ptr);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+uptr __sanitizer_malloc_usable_size(const void *ptr);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+__hwasan::__sanitizer_struct_mallinfo __sanitizer_mallinfo();
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __sanitizer_mallopt(int cmd, int value);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_malloc_stats(void);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer_calloc(uptr nmemb, uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer_realloc(void *ptr, uptr size);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void * __sanitizer_malloc(uptr size);
 }  // extern "C"
 
 #endif  // HWASAN_INTERFACE_INTERNAL_H
diff --git a/lib/hwasan/hwasan_linux.cc b/lib/hwasan/hwasan_linux.cc
index 5ab98dc..eef332f 100644
--- a/lib/hwasan/hwasan_linux.cc
+++ b/lib/hwasan/hwasan_linux.cc
@@ -22,6 +22,7 @@
 #include "hwasan_mapping.h"
 #include "hwasan_report.h"
 #include "hwasan_thread.h"
+#include "hwasan_thread_list.h"
 
 #include <elf.h>
 #include <link.h>
@@ -37,6 +38,10 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_procmaps.h"
 
+#if HWASAN_WITH_INTERCEPTORS && !SANITIZER_ANDROID
+THREADLOCAL uptr __hwasan_tls;
+#endif
+
 namespace __hwasan {
 
 static void ReserveShadowMemoryRange(uptr beg, uptr end, const char *name) {
@@ -51,8 +56,6 @@
         size);
     Abort();
   }
-  if (common_flags()->no_huge_pages_for_shadow) NoHugePagesInRegion(beg, size);
-  if (common_flags()->use_madv_dontdump) DontDumpShadowMemory(beg, size);
 }
 
 static void ProtectGap(uptr addr, uptr size) {
@@ -103,55 +106,31 @@
   else
     CHECK_EQ(kHighShadowEnd + 1, kHighMemStart);
   PrintRange(kHighShadowStart, kHighShadowEnd, "HighShadow");
-  if (SHADOW_OFFSET) {
-    if (kLowShadowEnd + 1 < kHighShadowStart)
-      PrintRange(kLowShadowEnd + 1, kHighShadowStart - 1, "ShadowGap");
-    else
-      CHECK_EQ(kLowMemEnd + 1, kHighShadowStart);
-    PrintRange(kLowShadowStart, kLowShadowEnd, "LowShadow");
-    if (kLowMemEnd + 1 < kLowShadowStart)
-      PrintRange(kLowMemEnd + 1, kLowShadowStart - 1, "ShadowGap");
-    else
-      CHECK_EQ(kLowMemEnd + 1, kLowShadowStart);
-    PrintRange(kLowMemStart, kLowMemEnd, "LowMem");
-    CHECK_EQ(0, kLowMemStart);
-  } else {
-    if (kLowMemEnd + 1 < kHighShadowStart)
-      PrintRange(kLowMemEnd + 1, kHighShadowStart - 1, "ShadowGap");
-    else
-      CHECK_EQ(kLowMemEnd + 1, kHighShadowStart);
-    PrintRange(kLowMemStart, kLowMemEnd, "LowMem");
-    CHECK_EQ(kLowShadowEnd + 1, kLowMemStart);
-    PrintRange(kLowShadowStart, kLowShadowEnd, "LowShadow");
-    PrintRange(0, kLowShadowStart - 1, "ShadowGap");
-  }
+  if (kLowShadowEnd + 1 < kHighShadowStart)
+    PrintRange(kLowShadowEnd + 1, kHighShadowStart - 1, "ShadowGap");
+  else
+    CHECK_EQ(kLowMemEnd + 1, kHighShadowStart);
+  PrintRange(kLowShadowStart, kLowShadowEnd, "LowShadow");
+  if (kLowMemEnd + 1 < kLowShadowStart)
+    PrintRange(kLowMemEnd + 1, kLowShadowStart - 1, "ShadowGap");
+  else
+    CHECK_EQ(kLowMemEnd + 1, kLowShadowStart);
+  PrintRange(kLowMemStart, kLowMemEnd, "LowMem");
+  CHECK_EQ(0, kLowMemStart);
 }
 
 static uptr GetHighMemEnd() {
   // HighMem covers the upper part of the address space.
   uptr max_address = GetMaxUserVirtualAddress();
-  if (SHADOW_OFFSET)
-    // Adjust max address to make sure that kHighMemEnd and kHighMemStart are
-    // properly aligned:
-    max_address |= SHADOW_GRANULARITY * GetMmapGranularity() - 1;
+  // Adjust max address to make sure that kHighMemEnd and kHighMemStart are
+  // properly aligned:
+  max_address |= (GetMmapGranularity() << kShadowScale) - 1;
   return max_address;
 }
 
 static void InitializeShadowBaseAddress(uptr shadow_size_bytes) {
-  // Set the shadow memory address to uninitialized.
-  __hwasan_shadow_memory_dynamic_address = kDefaultShadowSentinel;
-  uptr shadow_start = SHADOW_OFFSET;
-  // Detect if a dynamic shadow address must be used and find the available
-  // location when necessary. When dynamic address is used, the macro
-  // kLowShadowBeg expands to __hwasan_shadow_memory_dynamic_address which
-  // was just set to kDefaultShadowSentinel.
-  if (shadow_start == kDefaultShadowSentinel) {
-    __hwasan_shadow_memory_dynamic_address = 0;
-    CHECK_EQ(0, SHADOW_OFFSET);
-    shadow_start = FindDynamicShadowStart(shadow_size_bytes);
-  }
-  // Update the shadow memory address (potentially) used by instrumentation.
-  __hwasan_shadow_memory_dynamic_address = shadow_start;
+  __hwasan_shadow_memory_dynamic_address =
+      FindDynamicShadowStart(shadow_size_bytes);
 }
 
 bool InitShadow() {
@@ -159,29 +138,23 @@
   kHighMemEnd = GetHighMemEnd();
 
   // Determine shadow memory base offset.
-  InitializeShadowBaseAddress(MEM_TO_SHADOW_SIZE(kHighMemEnd));
+  InitializeShadowBaseAddress(MemToShadowSize(kHighMemEnd));
 
   // Place the low memory first.
-  if (SHADOW_OFFSET) {
-    kLowMemEnd = SHADOW_OFFSET - 1;
-    kLowMemStart = 0;
-  } else {
-    // LowMem covers as much of the first 4GB as possible.
-    kLowMemEnd = (1UL << 32) - 1;
-    kLowMemStart = MEM_TO_SHADOW(kLowMemEnd) + 1;
-  }
+  kLowMemEnd = __hwasan_shadow_memory_dynamic_address - 1;
+  kLowMemStart = 0;
 
   // Define the low shadow based on the already placed low memory.
-  kLowShadowEnd = MEM_TO_SHADOW(kLowMemEnd);
-  kLowShadowStart = SHADOW_OFFSET ? SHADOW_OFFSET : MEM_TO_SHADOW(kLowMemStart);
+  kLowShadowEnd = MemToShadow(kLowMemEnd);
+  kLowShadowStart = __hwasan_shadow_memory_dynamic_address;
 
   // High shadow takes whatever memory is left up there (making sure it is not
   // interfering with low memory in the fixed case).
-  kHighShadowEnd = MEM_TO_SHADOW(kHighMemEnd);
-  kHighShadowStart = Max(kLowMemEnd, MEM_TO_SHADOW(kHighShadowEnd)) + 1;
+  kHighShadowEnd = MemToShadow(kHighMemEnd);
+  kHighShadowStart = Max(kLowMemEnd, MemToShadow(kHighShadowEnd)) + 1;
 
   // High memory starts where allocated shadow allows.
-  kHighMemStart = SHADOW_TO_MEM(kHighShadowStart);
+  kHighMemStart = ShadowToMem(kHighShadowStart);
 
   // Check the sanity of the defined memory ranges (there might be gaps).
   CHECK_EQ(kHighMemStart % GetMmapGranularity(), 0);
@@ -190,10 +163,7 @@
   CHECK_GT(kHighShadowStart, kLowMemEnd);
   CHECK_GT(kLowMemEnd, kLowMemStart);
   CHECK_GT(kLowShadowEnd, kLowShadowStart);
-  if (SHADOW_OFFSET)
-    CHECK_GT(kLowShadowStart, kLowMemEnd);
-  else
-    CHECK_GT(kLowMemEnd, kLowShadowStart);
+  CHECK_GT(kLowShadowStart, kLowMemEnd);
 
   if (Verbosity())
     PrintAddressSpaceLayout();
@@ -204,21 +174,43 @@
 
   // Protect all the gaps.
   ProtectGap(0, Min(kLowMemStart, kLowShadowStart));
-  if (SHADOW_OFFSET) {
-    if (kLowMemEnd + 1 < kLowShadowStart)
-      ProtectGap(kLowMemEnd + 1, kLowShadowStart - kLowMemEnd - 1);
-    if (kLowShadowEnd + 1 < kHighShadowStart)
-      ProtectGap(kLowShadowEnd + 1, kHighShadowStart - kLowShadowEnd - 1);
-  } else {
-    if (kLowMemEnd + 1 < kHighShadowStart)
-      ProtectGap(kLowMemEnd + 1, kHighShadowStart - kLowMemEnd - 1);
-  }
+  if (kLowMemEnd + 1 < kLowShadowStart)
+    ProtectGap(kLowMemEnd + 1, kLowShadowStart - kLowMemEnd - 1);
+  if (kLowShadowEnd + 1 < kHighShadowStart)
+    ProtectGap(kLowShadowEnd + 1, kHighShadowStart - kLowShadowEnd - 1);
   if (kHighShadowEnd + 1 < kHighMemStart)
     ProtectGap(kHighShadowEnd + 1, kHighMemStart - kHighShadowEnd - 1);
 
   return true;
 }
 
+void InitThreads() {
+  CHECK(__hwasan_shadow_memory_dynamic_address);
+  uptr guard_page_size = GetMmapGranularity();
+  uptr thread_space_start =
+      __hwasan_shadow_memory_dynamic_address - (1ULL << kShadowBaseAlignment);
+  uptr thread_space_end =
+      __hwasan_shadow_memory_dynamic_address - guard_page_size;
+  ReserveShadowMemoryRange(thread_space_start, thread_space_end - 1,
+                           "hwasan threads");
+  ProtectGap(thread_space_end,
+             __hwasan_shadow_memory_dynamic_address - thread_space_end);
+  InitThreadList(thread_space_start, thread_space_end - thread_space_start);
+}
+
+static void MadviseShadowRegion(uptr beg, uptr end) {
+  uptr size = end - beg + 1;
+  if (common_flags()->no_huge_pages_for_shadow)
+    NoHugePagesInRegion(beg, size);
+  if (common_flags()->use_madv_dontdump)
+    DontDumpShadowMemory(beg, size);
+}
+
+void MadviseShadow() {
+  MadviseShadowRegion(kLowShadowStart, kLowShadowEnd);
+  MadviseShadowRegion(kHighShadowStart, kHighShadowEnd);
+}
+
 bool MemIsApp(uptr p) {
   CHECK(GetTagFromPointer(p) == 0);
   return p >= kHighMemStart || (p >= kLowMemStart && p <= kLowMemEnd);
@@ -240,37 +232,60 @@
 
 // ---------------------- TSD ---------------- {{{1
 
+extern "C" void __hwasan_thread_enter() {
+  hwasanThreadList().CreateCurrentThread();
+}
+
+extern "C" void __hwasan_thread_exit() {
+  Thread *t = GetCurrentThread();
+  // Make sure that signal handler can not see a stale current thread pointer.
+  atomic_signal_fence(memory_order_seq_cst);
+  if (t)
+    hwasanThreadList().ReleaseThread(t);
+}
+
+#if HWASAN_WITH_INTERCEPTORS
 static pthread_key_t tsd_key;
 static bool tsd_key_inited = false;
 
-void HwasanTSDInit(void (*destructor)(void *tsd)) {
-  CHECK(!tsd_key_inited);
-  tsd_key_inited = true;
-  CHECK_EQ(0, pthread_key_create(&tsd_key, destructor));
-}
-
-HwasanThread *GetCurrentThread() {
-  return (HwasanThread*)pthread_getspecific(tsd_key);
-}
-
-void SetCurrentThread(HwasanThread *t) {
-  // Make sure that HwasanTSDDtor gets called at the end.
-  CHECK(tsd_key_inited);
-  // Make sure we do not reset the current HwasanThread.
-  CHECK_EQ(0, pthread_getspecific(tsd_key));
-  pthread_setspecific(tsd_key, (void *)t);
+void HwasanTSDThreadInit() {
+  if (tsd_key_inited)
+    CHECK_EQ(0, pthread_setspecific(tsd_key,
+                                    (void *)GetPthreadDestructorIterations()));
 }
 
 void HwasanTSDDtor(void *tsd) {
-  HwasanThread *t = (HwasanThread*)tsd;
-  if (t->destructor_iterations_ > 1) {
-    t->destructor_iterations_--;
-    CHECK_EQ(0, pthread_setspecific(tsd_key, tsd));
+  uptr iterations = (uptr)tsd;
+  if (iterations > 1) {
+    CHECK_EQ(0, pthread_setspecific(tsd_key, (void *)(iterations - 1)));
     return;
   }
-  // Make sure that signal handler can not see a stale current thread pointer.
-  atomic_signal_fence(memory_order_seq_cst);
-  HwasanThread::TSDDtor(tsd);
+  __hwasan_thread_exit();
+}
+
+void HwasanTSDInit() {
+  CHECK(!tsd_key_inited);
+  tsd_key_inited = true;
+  CHECK_EQ(0, pthread_key_create(&tsd_key, HwasanTSDDtor));
+}
+#else
+void HwasanTSDInit() {}
+void HwasanTSDThreadInit() {}
+#endif
+
+#if SANITIZER_ANDROID
+uptr *GetCurrentThreadLongPtr() {
+  return (uptr *)get_android_tls_ptr();
+}
+#else
+uptr *GetCurrentThreadLongPtr() {
+  return &__hwasan_tls;
+}
+#endif
+
+Thread *GetCurrentThread() {
+  auto *R = (StackAllocationsRingBuffer*)GetCurrentThreadLongPtr();
+  return hwasanThreadList().GetThreadByBufferAddress((uptr)(R->Next()));
 }
 
 struct AccessInfo {
@@ -340,8 +355,8 @@
   BufferedStackTrace *stack = stack_buffer.data();
   stack->Reset();
   SignalContext sig{info, uc};
-  GetStackTrace(stack, kStackTraceMax, sig.pc, sig.bp, uc,
-                common_flags()->fast_unwind_on_fatal);
+  GetStackTrace(stack, kStackTraceMax, StackTrace::GetNextInstructionPc(sig.pc),
+                sig.bp, uc, common_flags()->fast_unwind_on_fatal);
 
   ReportTagMismatch(stack, ai.addr, ai.size, ai.is_store);
 
@@ -360,8 +375,8 @@
 
 static void OnStackUnwind(const SignalContext &sig, const void *,
                           BufferedStackTrace *stack) {
-  GetStackTrace(stack, kStackTraceMax, sig.pc, sig.bp, sig.context,
-                common_flags()->fast_unwind_on_fatal);
+  GetStackTrace(stack, kStackTraceMax, StackTrace::GetNextInstructionPc(sig.pc),
+                sig.bp, sig.context, common_flags()->fast_unwind_on_fatal);
 }
 
 void HwasanOnDeadlySignal(int signo, void *info, void *context) {
diff --git a/lib/hwasan/hwasan_mapping.h b/lib/hwasan/hwasan_mapping.h
index 650a5ae..e5e23dc 100644
--- a/lib/hwasan/hwasan_mapping.h
+++ b/lib/hwasan/hwasan_mapping.h
@@ -16,68 +16,41 @@
 #define HWASAN_MAPPING_H
 
 #include "sanitizer_common/sanitizer_internal_defs.h"
+#include "hwasan_interface_internal.h"
 
-// Typical mapping on Linux/x86_64 with fixed shadow mapping:
-// || [0x080000000000, 0x7fffffffffff] || HighMem    ||
-// || [0x008000000000, 0x07ffffffffff] || HighShadow ||
-// || [0x000100000000, 0x007fffffffff] || ShadowGap  ||
-// || [0x000010000000, 0x0000ffffffff] || LowMem     ||
-// || [0x000001000000, 0x00000fffffff] || LowShadow  ||
-// || [0x000000000000, 0x000000ffffff] || ShadowGap  ||
-//
-// and with dynamic shadow mapped at [0x770d59f40000, 0x7f0d59f40000]:
+// Typical mapping on Linux/x86_64:
+// with dynamic shadow mapped at [0x770d59f40000, 0x7f0d59f40000]:
 // || [0x7f0d59f40000, 0x7fffffffffff] || HighMem    ||
 // || [0x7efe2f934000, 0x7f0d59f3ffff] || HighShadow ||
 // || [0x7e7e2f934000, 0x7efe2f933fff] || ShadowGap  ||
 // || [0x770d59f40000, 0x7e7e2f933fff] || LowShadow  ||
 // || [0x000000000000, 0x770d59f3ffff] || LowMem     ||
 
-// Typical mapping on Android/AArch64 (39-bit VMA):
-// || [0x001000000000, 0x007fffffffff] || HighMem    ||
-// || [0x000800000000, 0x000fffffffff] || ShadowGap  ||
-// || [0x000100000000, 0x0007ffffffff] || HighShadow ||
-// || [0x000010000000, 0x0000ffffffff] || LowMem     ||
-// || [0x000001000000, 0x00000fffffff] || LowShadow  ||
-// || [0x000000000000, 0x000000ffffff] || ShadowGap  ||
-//
-// and with dynamic shadow mapped: [0x007477480000, 0x007c77480000]:
+// Typical mapping on Android/AArch64
+// with dynamic shadow mapped: [0x007477480000, 0x007c77480000]:
 // || [0x007c77480000, 0x007fffffffff] || HighMem    ||
 // || [0x007c3ebc8000, 0x007c7747ffff] || HighShadow ||
 // || [0x007bbebc8000, 0x007c3ebc7fff] || ShadowGap  ||
 // || [0x007477480000, 0x007bbebc7fff] || LowShadow  ||
 // || [0x000000000000, 0x00747747ffff] || LowMem     ||
 
-static constexpr __sanitizer::u64 kDefaultShadowSentinel = ~(__sanitizer::u64)0;
-
 // Reasonable values are 4 (for 1/16th shadow) and 6 (for 1/64th).
-constexpr __sanitizer::uptr kShadowScale = 4;
-constexpr __sanitizer::uptr kShadowAlignment = 1ULL << kShadowScale;
-
-#if SANITIZER_ANDROID
-# define HWASAN_FIXED_MAPPING 0
-#else
-# define HWASAN_FIXED_MAPPING 1
-#endif
-
-#if HWASAN_FIXED_MAPPING
-# define SHADOW_OFFSET (0)
-# define HWASAN_PREMAP_SHADOW 0
-#else
-# define SHADOW_OFFSET (__hwasan_shadow_memory_dynamic_address)
-# define HWASAN_PREMAP_SHADOW 1
-#endif
-
-#define SHADOW_GRANULARITY (1ULL << kShadowScale)
-
-#define MEM_TO_SHADOW(mem) (((uptr)(mem) >> kShadowScale) + SHADOW_OFFSET)
-#define SHADOW_TO_MEM(shadow) (((uptr)(shadow) - SHADOW_OFFSET) << kShadowScale)
-
-#define MEM_TO_SHADOW_SIZE(size) ((uptr)(size) >> kShadowScale)
-
-#define MEM_IS_APP(mem) MemIsApp((uptr)(mem))
+constexpr uptr kShadowScale = 4;
+constexpr uptr kShadowAlignment = 1ULL << kShadowScale;
 
 namespace __hwasan {
 
+inline uptr MemToShadow(uptr untagged_addr) {
+  return (untagged_addr >> kShadowScale) +
+         __hwasan_shadow_memory_dynamic_address;
+}
+inline uptr ShadowToMem(uptr shadow_addr) {
+  return (shadow_addr - __hwasan_shadow_memory_dynamic_address) << kShadowScale;
+}
+inline uptr MemToShadowSize(uptr size) {
+  return size >> kShadowScale;
+}
+
 bool MemIsApp(uptr p);
 
 }  // namespace __hwasan
diff --git a/lib/hwasan/hwasan_poisoning.cc b/lib/hwasan/hwasan_poisoning.cc
index b99d8ed..9c8e16b 100644
--- a/lib/hwasan/hwasan_poisoning.cc
+++ b/lib/hwasan/hwasan_poisoning.cc
@@ -1,4 +1,4 @@
-//===-- hwasan_poisoning.cc ---------------------------------------*- C++ -*-===//
+//===-- hwasan_poisoning.cc -------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -22,8 +22,8 @@
 uptr TagMemoryAligned(uptr p, uptr size, tag_t tag) {
   CHECK(IsAligned(p, kShadowAlignment));
   CHECK(IsAligned(size, kShadowAlignment));
-  uptr shadow_start = MEM_TO_SHADOW(p);
-  uptr shadow_size = MEM_TO_SHADOW_SIZE(size);
+  uptr shadow_start = MemToShadow(p);
+  uptr shadow_size = MemToShadowSize(size);
   internal_memset((void *)shadow_start, tag, shadow_size);
   return AddTagToPointer(p, tag);
 }
diff --git a/lib/hwasan/hwasan_poisoning.h b/lib/hwasan/hwasan_poisoning.h
index b44a91f..0dbf9d8 100644
--- a/lib/hwasan/hwasan_poisoning.h
+++ b/lib/hwasan/hwasan_poisoning.h
@@ -1,4 +1,4 @@
-//===-- hwasan_poisoning.h ----------------------------------------*- C++ -*-===//
+//===-- hwasan_poisoning.h --------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/hwasan/hwasan_report.cc b/lib/hwasan/hwasan_report.cc
index 16e9016..cf5f468 100644
--- a/lib/hwasan/hwasan_report.cc
+++ b/lib/hwasan/hwasan_report.cc
@@ -15,6 +15,8 @@
 #include "hwasan.h"
 #include "hwasan_allocator.h"
 #include "hwasan_mapping.h"
+#include "hwasan_thread.h"
+#include "hwasan_thread_list.h"
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_flags.h"
@@ -34,51 +36,169 @@
   return res;
 }
 
+// A RAII object that holds a copy of the current thread stack ring buffer.
+// The actual stack buffer may change while we are iterating over it (for
+// example, Printf may call syslog() which can itself be built with hwasan).
+class SavedStackAllocations {
+ public:
+  SavedStackAllocations(StackAllocationsRingBuffer *rb) {
+    uptr size = rb->size() * sizeof(uptr);
+    void *storage =
+        MmapAlignedOrDieOnFatalError(size, size * 2, "saved stack allocations");
+    new (&rb_) StackAllocationsRingBuffer(*rb, storage);
+  }
+
+  ~SavedStackAllocations() {
+    StackAllocationsRingBuffer *rb = get();
+    UnmapOrDie(rb->StartOfStorage(), rb->size() * sizeof(uptr));
+  }
+
+  StackAllocationsRingBuffer *get() {
+    return (StackAllocationsRingBuffer *)&rb_;
+  }
+
+ private:
+  uptr rb_;
+};
+
 class Decorator: public __sanitizer::SanitizerCommonDecorator {
  public:
   Decorator() : SanitizerCommonDecorator() { }
+  const char *Access() { return Blue(); }
   const char *Allocation() const { return Magenta(); }
   const char *Origin() const { return Magenta(); }
   const char *Name() const { return Green(); }
+  const char *Location() { return Green(); }
+  const char *Thread() { return Green(); }
 };
 
-struct HeapAddressDescription {
-  uptr addr;
-  u32 alloc_stack_id;
-  u32 free_stack_id;
-
-  void Print() const {
-    Decorator d;
-    if (free_stack_id) {
-      Printf("%sfreed here:%s\n", d.Allocation(), d.Default());
-      GetStackTraceFromId(free_stack_id).Print();
-      Printf("%spreviously allocated here:%s\n", d.Allocation(), d.Default());
-    } else {
-      Printf("%sallocated here:%s\n", d.Allocation(), d.Default());
+// Returns the index of the rb element that matches tagged_addr (plus one),
+// or zero if found nothing.
+uptr FindHeapAllocation(HeapAllocationsRingBuffer *rb,
+                        uptr tagged_addr,
+                        HeapAllocationRecord *har) {
+  if (!rb) return 0;
+  for (uptr i = 0, size = rb->size(); i < size; i++) {
+    auto h = (*rb)[i];
+    if (h.tagged_addr <= tagged_addr &&
+        h.tagged_addr + h.requested_size > tagged_addr) {
+      *har = h;
+      return i + 1;
     }
-    GetStackTraceFromId(alloc_stack_id).Print();
   }
-};
-
-bool GetHeapAddressInformation(uptr addr, uptr access_size,
-                               HeapAddressDescription *description) {
-  HwasanChunkView chunk = FindHeapChunkByAddress(addr);
-  if (!chunk.IsValid())
-    return false;
-  description->addr = addr;
-  description->alloc_stack_id = chunk.GetAllocStackId();
-  description->free_stack_id = chunk.GetFreeStackId();
-  return true;
+  return 0;
 }
 
-void PrintAddressDescription(uptr addr, uptr access_size) {
-  HeapAddressDescription heap_description;
-  if (GetHeapAddressInformation(addr, access_size, &heap_description)) {
-    heap_description.Print();
-    return;
+void PrintAddressDescription(
+    uptr tagged_addr, uptr access_size,
+    StackAllocationsRingBuffer *current_stack_allocations) {
+  Decorator d;
+  int num_descriptions_printed = 0;
+  uptr untagged_addr = UntagAddr(tagged_addr);
+  // Check if this looks like a heap buffer overflow by scanning
+  // the shadow left and right and looking for the first adjacent
+  // object with a different memory tag. If that tag matches addr_tag,
+  // check the allocator if it has a live chunk there.
+  tag_t addr_tag = GetTagFromPointer(tagged_addr);
+  tag_t *tag_ptr = reinterpret_cast<tag_t*>(MemToShadow(untagged_addr));
+  if (*tag_ptr != addr_tag) { // should be true usually.
+    tag_t *left = tag_ptr, *right = tag_ptr;
+    // scan left.
+    for (int i = 0; i < 1000 && *left == *tag_ptr; i++, left--){}
+    // scan right.
+    for (int i = 0; i < 1000 && *right == *tag_ptr; i++, right++){}
+    // Chose the object that has addr_tag and that is closer to addr.
+    tag_t *candidate = nullptr;
+    if (*right == addr_tag && *left == addr_tag)
+      candidate = right - tag_ptr < tag_ptr - left ? right : left;
+    else if (*right == addr_tag)
+      candidate = right;
+    else if (*left == addr_tag)
+      candidate = left;
+
+    if (candidate) {
+      uptr mem = ShadowToMem(reinterpret_cast<uptr>(candidate));
+      HwasanChunkView chunk = FindHeapChunkByAddress(mem);
+      if (chunk.IsAllocated()) {
+        Printf("%s", d.Location());
+        Printf(
+            "%p is located %zd bytes to the %s of %zd-byte region [%p,%p)\n",
+            untagged_addr,
+            candidate == left ? untagged_addr - chunk.End()
+            : chunk.Beg() - untagged_addr,
+            candidate == right ? "left" : "right", chunk.UsedSize(),
+            chunk.Beg(), chunk.End());
+        Printf("%s", d.Allocation());
+        Printf("allocated here:\n");
+        Printf("%s", d.Default());
+        GetStackTraceFromId(chunk.GetAllocStackId()).Print();
+        num_descriptions_printed++;
+      }
+    }
   }
-  // We exhausted our possibilities. Bail out.
-  Printf("HWAddressSanitizer can not describe address in more detail.\n");
+
+  hwasanThreadList().VisitAllLiveThreads([&](Thread *t) {
+    // Scan all threads' ring buffers to find if it's a heap-use-after-free.
+    HeapAllocationRecord har;
+    if (uptr D = FindHeapAllocation(t->heap_allocations(), tagged_addr, &har)) {
+      Printf("%s", d.Location());
+      Printf("%p is located %zd bytes inside of %zd-byte region [%p,%p)\n",
+             untagged_addr, untagged_addr - UntagAddr(har.tagged_addr),
+             har.requested_size, UntagAddr(har.tagged_addr),
+             UntagAddr(har.tagged_addr) + har.requested_size);
+      Printf("%s", d.Allocation());
+      Printf("freed by thread T%zd here:\n", t->unique_id());
+      Printf("%s", d.Default());
+      GetStackTraceFromId(har.free_context_id).Print();
+
+      Printf("%s", d.Allocation());
+      Printf("previously allocated here:\n", t);
+      Printf("%s", d.Default());
+      GetStackTraceFromId(har.alloc_context_id).Print();
+      t->Announce();
+
+      // Print a developer note: the index of this heap object
+      // in the thread's deallocation ring buffer.
+      Printf("hwasan_dev_note_heap_rb_distance: %zd %zd\n", D,
+             flags()->heap_history_size);
+
+      num_descriptions_printed++;
+    }
+
+    // Very basic check for stack memory.
+    if (t->AddrIsInStack(untagged_addr)) {
+      Printf("%s", d.Location());
+      Printf("Address %p is located in stack of thread T%zd\n", untagged_addr,
+             t->unique_id());
+      Printf("%s", d.Default());
+      t->Announce();
+
+      // Temporary report section, needs to be improved.
+      Printf("Previosly allocated frames:\n");
+      auto *sa = (t == GetCurrentThread() && current_stack_allocations)
+                     ? current_stack_allocations
+                     : t->stack_allocations();
+      uptr frames = Min((uptr)flags()->stack_history_size, sa->size());
+      for (uptr i = 0; i < frames; i++) {
+        uptr record = (*sa)[i];
+        if (!record)
+          break;
+        uptr sp = (record >> 48) << 4;
+        uptr pc_mask = (1ULL << 48) - 1;
+        uptr pc = record & pc_mask;
+        uptr fixed_pc = StackTrace::GetNextInstructionPc(pc);
+        StackTrace stack(&fixed_pc, 1);
+        Printf("record: %p pc: %p sp: %p", record, pc, sp);
+        stack.Print();
+      }
+
+      num_descriptions_printed++;
+    }
+  });
+
+  if (!num_descriptions_printed)
+    // We exhausted our possibilities. Bail out.
+    Printf("HWAddressSanitizer can not describe address in more detail.\n");
 }
 
 void ReportInvalidAccess(StackTrace *stack, u32 origin) {
@@ -97,37 +217,106 @@
 void ReportInvalidAccessInsideAddressRange(const char *what, const void *start,
                                            uptr size, uptr offset) {
   ScopedErrorReportLock l;
+  SavedStackAllocations current_stack_allocations(
+      GetCurrentThread()->stack_allocations());
 
   Decorator d;
   Printf("%s", d.Warning());
   Printf("%sTag mismatch in %s%s%s at offset %zu inside [%p, %zu)%s\n",
          d.Warning(), d.Name(), what, d.Warning(), offset, start, size,
          d.Default());
-  PrintAddressDescription((uptr)start + offset, 1);
+  PrintAddressDescription((uptr)start + offset, 1,
+                          current_stack_allocations.get());
   // if (__sanitizer::Verbosity())
   //   DescribeMemoryRange(start, size);
 }
 
-void ReportTagMismatch(StackTrace *stack, uptr addr, uptr access_size,
-                       bool is_store) {
+static void PrintTagsAroundAddr(tag_t *tag_ptr) {
+  Printf(
+      "Memory tags around the buggy address (one tag corresponds to %zd "
+      "bytes):\n", kShadowAlignment);
+
+  const uptr row_len = 16;  // better be power of two.
+  const uptr num_rows = 11;
+  tag_t *center_row_beg = reinterpret_cast<tag_t *>(
+      RoundDownTo(reinterpret_cast<uptr>(tag_ptr), row_len));
+  tag_t *beg_row = center_row_beg - row_len * (num_rows / 2);
+  tag_t *end_row = center_row_beg + row_len * (num_rows / 2);
+  InternalScopedString s(GetPageSizeCached());
+  for (tag_t *row = beg_row; row < end_row; row += row_len) {
+    s.append("%s", row == center_row_beg ? "=>" : "  ");
+    for (uptr i = 0; i < row_len; i++) {
+      s.append("%s", row + i == tag_ptr ? "[" : " ");
+      s.append("%02x", row[i]);
+      s.append("%s", row + i == tag_ptr ? "]" : " ");
+    }
+    s.append("%s\n", row == center_row_beg ? "<=" : "  ");
+    Printf("%s", s.data());
+    s.clear();
+  }
+}
+
+void ReportInvalidFree(StackTrace *stack, uptr tagged_addr) {
   ScopedErrorReportLock l;
-
+  uptr untagged_addr = UntagAddr(tagged_addr);
+  tag_t ptr_tag = GetTagFromPointer(tagged_addr);
+  tag_t *tag_ptr = reinterpret_cast<tag_t*>(MemToShadow(untagged_addr));
+  tag_t mem_tag = *tag_ptr;
   Decorator d;
-  Printf("%s", d.Warning());
-  uptr address = GetAddressFromPointer(addr);
-  Printf("%s of size %zu at %p\n", is_store ? "WRITE" : "READ", access_size,
-         address);
-
-  tag_t ptr_tag = GetTagFromPointer(addr);
-  tag_t mem_tag = *(tag_t *)MEM_TO_SHADOW(address);
-  Printf("pointer tag 0x%x\nmemory tag  0x%x\n", ptr_tag, mem_tag);
+  Printf("%s", d.Error());
+  uptr pc = stack->size ? stack->trace[0] : 0;
+  const char *bug_type = "invalid-free";
+  Report("ERROR: %s: %s on address %p at pc %p\n", SanitizerToolName, bug_type,
+         untagged_addr, pc);
+  Printf("%s", d.Access());
+  Printf("tags: %02x/%02x (ptr/mem)\n", ptr_tag, mem_tag);
   Printf("%s", d.Default());
 
   stack->Print();
 
-  PrintAddressDescription(address, access_size);
+  PrintAddressDescription(tagged_addr, 0, nullptr);
 
-  ReportErrorSummary("tag-mismatch", stack);
+  PrintTagsAroundAddr(tag_ptr);
+
+  ReportErrorSummary(bug_type, stack);
+  Die();
+}
+
+void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size,
+                       bool is_store) {
+  ScopedErrorReportLock l;
+  SavedStackAllocations current_stack_allocations(
+      GetCurrentThread()->stack_allocations());
+
+  Decorator d;
+  Printf("%s", d.Error());
+  uptr untagged_addr = UntagAddr(tagged_addr);
+  // TODO: when possible, try to print heap-use-after-free, etc.
+  const char *bug_type = "tag-mismatch";
+  uptr pc = stack->size ? stack->trace[0] : 0;
+  Report("ERROR: %s: %s on address %p at pc %p\n", SanitizerToolName, bug_type,
+         untagged_addr, pc);
+
+  Thread *t = GetCurrentThread();
+
+  tag_t ptr_tag = GetTagFromPointer(tagged_addr);
+  tag_t *tag_ptr = reinterpret_cast<tag_t*>(MemToShadow(untagged_addr));
+  tag_t mem_tag = *tag_ptr;
+  Printf("%s", d.Access());
+  Printf("%s of size %zu at %p tags: %02x/%02x (ptr/mem) in thread T%zd\n",
+         is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag,
+         mem_tag, t->unique_id());
+  Printf("%s", d.Default());
+
+  stack->Print();
+
+  PrintAddressDescription(tagged_addr, access_size,
+                          current_stack_allocations.get());
+  t->Announce();
+
+  PrintTagsAroundAddr(tag_ptr);
+
+  ReportErrorSummary(bug_type, stack);
 }
 
 }  // namespace __hwasan
diff --git a/lib/hwasan/hwasan_report.h b/lib/hwasan/hwasan_report.h
index bb33f1a..7a9eec8 100644
--- a/lib/hwasan/hwasan_report.h
+++ b/lib/hwasan/hwasan_report.h
@@ -27,6 +27,7 @@
                                            uptr size, uptr offset);
 void ReportTagMismatch(StackTrace *stack, uptr addr, uptr access_size,
                        bool is_store);
+void ReportInvalidFree(StackTrace *stack, uptr addr);
 
 void ReportAtExitStatistics();
 
diff --git a/lib/hwasan/hwasan_thread.cc b/lib/hwasan/hwasan_thread.cc
index b50c0fc..0d15c7e 100644
--- a/lib/hwasan/hwasan_thread.cc
+++ b/lib/hwasan/hwasan_thread.cc
@@ -5,8 +5,11 @@
 #include "hwasan_poisoning.h"
 #include "hwasan_interface_internal.h"
 
+#include "sanitizer_common/sanitizer_file.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_tls_get_addr.h"
 
+
 namespace __hwasan {
 
 static u32 RandomSeed() {
@@ -22,69 +25,79 @@
   return seed;
 }
 
-HwasanThread *HwasanThread::Create(thread_callback_t start_routine,
-                               void *arg) {
-  uptr PageSize = GetPageSizeCached();
-  uptr size = RoundUpTo(sizeof(HwasanThread), PageSize);
-  HwasanThread *thread = (HwasanThread*)MmapOrDie(size, __func__);
-  thread->start_routine_ = start_routine;
-  thread->arg_ = arg;
-  thread->destructor_iterations_ = GetPthreadDestructorIterations();
-  thread->random_state_ = flags()->random_tags ? RandomSeed() : 0;
+void Thread::Init(uptr stack_buffer_start, uptr stack_buffer_size) {
+  static u64 unique_id;
+  unique_id_ = unique_id++;
+  random_state_ = flags()->random_tags ? RandomSeed() : unique_id_;
+  if (auto sz = flags()->heap_history_size)
+    heap_allocations_ = HeapAllocationsRingBuffer::New(sz);
 
-  return thread;
+  HwasanTSDThreadInit();  // Only needed with interceptors.
+  uptr *ThreadLong = GetCurrentThreadLongPtr();
+  // The following implicitly sets (this) as the current thread.
+  stack_allocations_ = new (ThreadLong)
+      StackAllocationsRingBuffer((void *)stack_buffer_start, stack_buffer_size);
+  // Check that it worked.
+  CHECK_EQ(GetCurrentThread(), this);
+
+  // ScopedTaggingDisable needs GetCurrentThread to be set up.
+  ScopedTaggingDisabler disabler;
+
+  // If this process is "init" (pid 1), /proc may not be mounted yet.
+  if (IsMainThread() && !FileExists("/proc/self/maps")) {
+    stack_top_ = stack_bottom_ = 0;
+    tls_begin_ = tls_end_ = 0;
+  } else {
+    uptr tls_size;
+    uptr stack_size;
+    GetThreadStackAndTls(IsMainThread(), &stack_bottom_, &stack_size,
+                         &tls_begin_, &tls_size);
+    stack_top_ = stack_bottom_ + stack_size;
+    tls_end_ = tls_begin_ + tls_size;
+
+    int local;
+    CHECK(AddrIsInStack((uptr)&local));
+    CHECK(MemIsApp(stack_bottom_));
+    CHECK(MemIsApp(stack_top_ - 1));
+
+    if (stack_bottom_) {
+      CHECK(MemIsApp(stack_bottom_));
+      CHECK(MemIsApp(stack_top_ - 1));
+    }
+  }
+
+  if (flags()->verbose_threads) {
+    if (IsMainThread()) {
+      Printf("sizeof(Thread): %zd sizeof(HeapRB): %zd sizeof(StackRB): %zd\n",
+             sizeof(Thread), heap_allocations_->SizeInBytes(),
+             stack_allocations_->size() * sizeof(uptr));
+    }
+    Print("Creating  : ");
+  }
 }
 
-void HwasanThread::SetThreadStackAndTls() {
-  uptr tls_size = 0;
-  uptr stack_size = 0;
-  GetThreadStackAndTls(IsMainThread(), &stack_bottom_, &stack_size,
-                       &tls_begin_, &tls_size);
-  stack_top_ = stack_bottom_ + stack_size;
-  tls_end_ = tls_begin_ + tls_size;
-
-  int local;
-  CHECK(AddrIsInStack((uptr)&local));
-}
-
-void HwasanThread::Init() {
-  SetThreadStackAndTls();
-  CHECK(MEM_IS_APP(stack_bottom_));
-  CHECK(MEM_IS_APP(stack_top_ - 1));
-}
-
-void HwasanThread::TSDDtor(void *tsd) {
-  HwasanThread *t = (HwasanThread*)tsd;
-  t->Destroy();
-}
-
-void HwasanThread::ClearShadowForThreadStackAndTLS() {
-  TagMemory(stack_bottom_, stack_top_ - stack_bottom_, 0);
+void Thread::ClearShadowForThreadStackAndTLS() {
+  if (stack_top_ != stack_bottom_)
+    TagMemory(stack_bottom_, stack_top_ - stack_bottom_, 0);
   if (tls_begin_ != tls_end_)
     TagMemory(tls_begin_, tls_end_ - tls_begin_, 0);
 }
 
-void HwasanThread::Destroy() {
-  malloc_storage().CommitBack();
+void Thread::Destroy() {
+  if (flags()->verbose_threads)
+    Print("Destroying: ");
+  AllocatorSwallowThreadLocalCache(allocator_cache());
   ClearShadowForThreadStackAndTLS();
-  uptr size = RoundUpTo(sizeof(HwasanThread), GetPageSizeCached());
-  UnmapOrDie(this, size);
+  if (heap_allocations_)
+    heap_allocations_->Delete();
   DTLS_Destroy();
 }
 
-thread_return_t HwasanThread::ThreadStart() {
-  Init();
-
-  if (!start_routine_) {
-    // start_routine_ == 0 if we're on the main thread or on one of the
-    // OS X libdispatch worker threads. But nobody is supposed to call
-    // ThreadStart() for the worker threads.
-    return 0;
-  }
-
-  thread_return_t res = start_routine_(arg_);
-
-  return res;
+void Thread::Print(const char *Prefix) {
+  Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix,
+         unique_id_, this, stack_bottom(), stack_top(),
+         stack_top() - stack_bottom(),
+         tls_begin(), tls_end());
 }
 
 static u32 xorshift(u32 state) {
@@ -95,7 +108,8 @@
 }
 
 // Generate a (pseudo-)random non-zero tag.
-tag_t HwasanThread::GenerateRandomTag() {
+tag_t Thread::GenerateRandomTag() {
+  if (tagging_disabled_) return 0;
   tag_t tag;
   do {
     if (flags()->random_tags) {
diff --git a/lib/hwasan/hwasan_thread.h b/lib/hwasan/hwasan_thread.h
index 1e482ad..4830473 100644
--- a/lib/hwasan/hwasan_thread.h
+++ b/lib/hwasan/hwasan_thread.h
@@ -1,4 +1,4 @@
-//===-- hwasan_thread.h -------------------------------------------*- C++ -*-===//
+//===-- hwasan_thread.h -----------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,23 +16,23 @@
 
 #include "hwasan_allocator.h"
 #include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_ring_buffer.h"
 
 namespace __hwasan {
 
-class HwasanThread {
- public:
-  static HwasanThread *Create(thread_callback_t start_routine, void *arg);
-  static void TSDDtor(void *tsd);
-  void Destroy();
+typedef __sanitizer::CompactRingBuffer<uptr> StackAllocationsRingBuffer;
 
-  void Init();  // Should be called from the thread itself.
-  thread_return_t ThreadStart();
+class Thread {
+ public:
+  void Init(uptr stack_buffer_start, uptr stack_buffer_size);  // Must be called from the thread itself.
+  void Destroy();
 
   uptr stack_top() { return stack_top_; }
   uptr stack_bottom() { return stack_bottom_; }
+  uptr stack_size() { return stack_top() - stack_bottom(); }
   uptr tls_begin() { return tls_begin_; }
   uptr tls_end() { return tls_end_; }
-  bool IsMainThread() { return start_routine_ == nullptr; }
+  bool IsMainThread() { return unique_id_ == 0; }
 
   bool AddrIsInStack(uptr addr) {
     return addr >= stack_bottom_ && addr < stack_top_;
@@ -50,19 +50,28 @@
   void EnterInterceptorScope() { in_interceptor_scope_++; }
   void LeaveInterceptorScope() { in_interceptor_scope_--; }
 
-  HwasanThreadLocalMallocStorage &malloc_storage() { return malloc_storage_; }
+  AllocatorCache *allocator_cache() { return &allocator_cache_; }
+  HeapAllocationsRingBuffer *heap_allocations() { return heap_allocations_; }
+  StackAllocationsRingBuffer *stack_allocations() { return stack_allocations_; }
 
   tag_t GenerateRandomTag();
 
-  int destructor_iterations_;
+  void DisableTagging() { tagging_disabled_++; }
+  void EnableTagging() { tagging_disabled_--; }
+  bool TaggingIsDisabled() const { return tagging_disabled_; }
+
+  u64 unique_id() const { return unique_id_; }
+  void Announce() {
+    if (announced_) return;
+    announced_ = true;
+    Print("Thread: ");
+  }
 
  private:
-  // NOTE: There is no HwasanThread constructor. It is allocated
+  // NOTE: There is no Thread constructor. It is allocated
   // via mmap() and *must* be valid in zero-initialized state.
-  void SetThreadStackAndTls();
   void ClearShadowForThreadStackAndTLS();
-  thread_callback_t start_routine_;
-  void *arg_;
+  void Print(const char *prefix);
   uptr stack_top_;
   uptr stack_bottom_;
   uptr tls_begin_;
@@ -75,11 +84,30 @@
   u32 random_state_;
   u32 random_buffer_;
 
-  HwasanThreadLocalMallocStorage malloc_storage_;
+  AllocatorCache allocator_cache_;
+  HeapAllocationsRingBuffer *heap_allocations_;
+  StackAllocationsRingBuffer *stack_allocations_;
+
+  static void InsertIntoThreadList(Thread *t);
+  static void RemoveFromThreadList(Thread *t);
+  Thread *next_;  // All live threads form a linked list.
+
+  u64 unique_id_;  // counting from zero.
+
+  u32 tagging_disabled_;  // if non-zero, malloc uses zero tag in this thread.
+
+  bool announced_;
+
+  friend struct ThreadListHead;
 };
 
-HwasanThread *GetCurrentThread();
-void SetCurrentThread(HwasanThread *t);
+Thread *GetCurrentThread();
+uptr *GetCurrentThreadLongPtr();
+
+struct ScopedTaggingDisabler {
+  ScopedTaggingDisabler() { GetCurrentThread()->DisableTagging(); }
+  ~ScopedTaggingDisabler() { GetCurrentThread()->EnableTagging(); }
+};
 
 } // namespace __hwasan
 
diff --git a/lib/hwasan/hwasan_thread_list.cc b/lib/hwasan/hwasan_thread_list.cc
new file mode 100644
index 0000000..a31eee8
--- /dev/null
+++ b/lib/hwasan/hwasan_thread_list.cc
@@ -0,0 +1,15 @@
+#include "hwasan_thread_list.h"
+
+namespace __hwasan {
+static ALIGNED(16) char thread_list_placeholder[sizeof(HwasanThreadList)];
+static HwasanThreadList *hwasan_thread_list;
+
+HwasanThreadList &hwasanThreadList() { return *hwasan_thread_list; }
+
+void InitThreadList(uptr storage, uptr size) {
+  CHECK(hwasan_thread_list == nullptr);
+  hwasan_thread_list =
+      new (thread_list_placeholder) HwasanThreadList(storage, size);
+}
+
+} // namespace
diff --git a/lib/hwasan/hwasan_thread_list.h b/lib/hwasan/hwasan_thread_list.h
new file mode 100644
index 0000000..53747b5
--- /dev/null
+++ b/lib/hwasan/hwasan_thread_list.h
@@ -0,0 +1,200 @@
+//===-- hwasan_thread_list.h ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of HWAddressSanitizer.
+//
+//===----------------------------------------------------------------------===//
+
+// HwasanThreadList is a registry for live threads, as well as an allocator for
+// HwasanThread objects and their stack history ring buffers. There are
+// constraints on memory layout of the shadow region and CompactRingBuffer that
+// are part of the ABI contract between compiler-rt and llvm.
+//
+// * Start of the shadow memory region is aligned to 2**kShadowBaseAlignment.
+// * All stack ring buffers are located within (2**kShadowBaseAlignment)
+// sized region below and adjacent to the shadow region.
+// * Each ring buffer has a size of (2**N)*4096 where N is in [0, 8), and is
+// aligned to twice its size. The value of N can be different for each buffer.
+//
+// These constrains guarantee that, given an address A of any element of the
+// ring buffer,
+//     A_next = (A + sizeof(uptr)) & ~((1 << (N + 13)) - 1)
+//   is the address of the next element of that ring buffer (with wrap-around).
+// And, with K = kShadowBaseAlignment,
+//     S = (A | ((1 << K) - 1)) + 1
+//   (align up to kShadowBaseAlignment) is the start of the shadow region.
+//
+// These calculations are used in compiler instrumentation to update the ring
+// buffer and obtain the base address of shadow using only two inputs: address
+// of the current element of the ring buffer, and N (i.e. size of the ring
+// buffer). Since the value of N is very limited, we pack both inputs into a
+// single thread-local word as
+//   (1 << (N + 56)) | A
+// See the implementation of class CompactRingBuffer, which is what is stored in
+// said thread-local word.
+//
+// Note the unusual way of aligning up the address of the shadow:
+//   (A | ((1 << K) - 1)) + 1
+// It is only correct if A is not already equal to the shadow base address, but
+// it saves 2 instructions on AArch64.
+
+#include "hwasan.h"
+#include "hwasan_allocator.h"
+#include "hwasan_flags.h"
+#include "hwasan_thread.h"
+
+#include "sanitizer_common/sanitizer_placement_new.h"
+
+namespace __hwasan {
+
+static uptr RingBufferSize() {
+  uptr desired_bytes = flags()->stack_history_size * sizeof(uptr);
+  // FIXME: increase the limit to 8 once this bug is fixed:
+  // https://bugs.llvm.org/show_bug.cgi?id=39030
+  for (int shift = 1; shift < 7; ++shift) {
+    uptr size = 4096 * (1ULL << shift);
+    if (size >= desired_bytes)
+      return size;
+  }
+  Printf("stack history size too large: %d\n", flags()->stack_history_size);
+  CHECK(0);
+  return 0;
+}
+
+struct ThreadListHead {
+  Thread *list_;
+
+  ThreadListHead() : list_(nullptr) {}
+
+  void Push(Thread *t) {
+    t->next_ = list_;
+    list_ = t;
+  }
+
+  Thread *Pop() {
+    Thread *t = list_;
+    if (t)
+      list_ = t->next_;
+    return t;
+  }
+
+  void Remove(Thread *t) {
+    Thread **cur = &list_;
+    while (*cur != t) cur = &(*cur)->next_;
+    CHECK(*cur && "thread not found");
+    *cur = (*cur)->next_;
+  }
+
+  template <class CB>
+  void ForEach(CB cb) {
+    Thread *t = list_;
+    while (t) {
+      cb(t);
+      t = t->next_;
+    }
+  }
+};
+
+struct ThreadStats {
+  uptr n_live_threads;
+  uptr total_stack_size;
+};
+
+class HwasanThreadList {
+ public:
+  HwasanThreadList(uptr storage, uptr size)
+      : free_space_(storage),
+        free_space_end_(storage + size),
+        ring_buffer_size_(RingBufferSize()) {}
+
+  Thread *CreateCurrentThread() {
+    Thread *t;
+    {
+      SpinMutexLock l(&list_mutex_);
+      t = free_list_.Pop();
+      if (t)
+        internal_memset((void *)t, 0, sizeof(Thread) + ring_buffer_size_);
+      else
+        t = AllocThread();
+      live_list_.Push(t);
+    }
+    t->Init((uptr)(t + 1), ring_buffer_size_);
+    AddThreadStats(t);
+    return t;
+  }
+
+  void ReleaseThread(Thread *t) {
+    // FIXME: madvise away the ring buffer?
+    RemoveThreadStats(t);
+    t->Destroy();
+    SpinMutexLock l(&list_mutex_);
+    live_list_.Remove(t);
+    free_list_.Push(t);
+  }
+
+  Thread *GetThreadByBufferAddress(uptr p) {
+    uptr align = ring_buffer_size_ * 2;
+    return (Thread *)(RoundDownTo(p, align) - sizeof(Thread));
+  }
+
+  uptr MemoryUsedPerThread() {
+    uptr res = sizeof(Thread) + ring_buffer_size_;
+    if (auto sz = flags()->heap_history_size)
+      res += HeapAllocationsRingBuffer::SizeInBytes(sz);
+    return res;
+  }
+
+  template <class CB>
+  void VisitAllLiveThreads(CB cb) {
+    SpinMutexLock l(&list_mutex_);
+    live_list_.ForEach(cb);
+  }
+
+  void AddThreadStats(Thread *t) {
+    SpinMutexLock l(&stats_mutex_);
+    stats_.n_live_threads++;
+    stats_.total_stack_size += t->stack_size();
+  }
+
+  void RemoveThreadStats(Thread *t) {
+    SpinMutexLock l(&stats_mutex_);
+    stats_.n_live_threads--;
+    stats_.total_stack_size -= t->stack_size();
+  }
+
+  ThreadStats GetThreadStats() {
+    SpinMutexLock l(&stats_mutex_);
+    return stats_;
+  }
+
+ private:
+  Thread *AllocThread() {
+    uptr align = ring_buffer_size_ * 2;
+    uptr ring_buffer_start = RoundUpTo(free_space_ + sizeof(Thread), align);
+    free_space_ = ring_buffer_start + ring_buffer_size_;
+    CHECK(free_space_ <= free_space_end_ && "out of thread memory");
+    return (Thread *)(ring_buffer_start - sizeof(Thread));
+  }
+
+  uptr free_space_;
+  uptr free_space_end_;
+  uptr ring_buffer_size_;
+
+  ThreadListHead free_list_;
+  ThreadListHead live_list_;
+  SpinMutex list_mutex_;
+
+  ThreadStats stats_;
+  SpinMutex stats_mutex_;
+};
+
+void InitThreadList(uptr storage, uptr size);
+HwasanThreadList &hwasanThreadList();
+
+} // namespace
diff --git a/lib/interception/interception.h b/lib/interception/interception.h
index ddd6ec2..07142f1 100644
--- a/lib/interception/interception.h
+++ b/lib/interception/interception.h
@@ -29,6 +29,7 @@
 typedef __sanitizer::sptr    SSIZE_T;
 typedef __sanitizer::sptr    PTRDIFF_T;
 typedef __sanitizer::s64     INTMAX_T;
+typedef __sanitizer::u64     UINTMAX_T;
 typedef __sanitizer::OFF_T   OFF_T;
 typedef __sanitizer::OFF64_T OFF64_T;
 
diff --git a/lib/interception/interception_win.cc b/lib/interception/interception_win.cc
index bd4ad72..cd13827 100644
--- a/lib/interception/interception_win.cc
+++ b/lib/interception/interception_win.cc
@@ -223,8 +223,8 @@
   return true;
 }
 
-static const u8 kHintNop9Bytes[] = {
-  0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
+static const u8 kHintNop8Bytes[] = {
+  0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
 };
 
 template<class T>
@@ -239,8 +239,8 @@
 static bool FunctionHasPadding(uptr address, uptr size) {
   if (IsMemoryPadding(address - size, size))
     return true;
-  if (size <= sizeof(kHintNop9Bytes) &&
-      FunctionHasPrefix(address, kHintNop9Bytes))
+  if (size <= sizeof(kHintNop8Bytes) &&
+      FunctionHasPrefix(address, kHintNop8Bytes))
     return true;
   return false;
 }
diff --git a/lib/lsan/lsan_common.cc b/lib/lsan/lsan_common.cc
index 012a673..eaa5cad 100644
--- a/lib/lsan/lsan_common.cc
+++ b/lib/lsan/lsan_common.cc
@@ -100,8 +100,6 @@
 
 static InternalMmapVector<RootRegion> *root_regions;
 
-static uptr initialized_for_pid;
-
 InternalMmapVector<RootRegion> const *GetRootRegions() { return root_regions; }
 
 void InitializeRootRegions() {
@@ -115,7 +113,6 @@
 }
 
 void InitCommonLsan() {
-  initialized_for_pid = internal_getpid();
   InitializeRootRegions();
   if (common_flags()->detect_leaks) {
     // Initialization which can fail or print warnings should only be done if
@@ -571,12 +568,6 @@
 static bool CheckForLeaks() {
   if (&__lsan_is_turned_off && __lsan_is_turned_off())
       return false;
-  if (initialized_for_pid != internal_getpid()) {
-    // If process was forked and it had threads we fail to detect references
-    // from other threads.
-    Report("WARNING: LeakSanitizer is disabled in forked process.\n");
-    return false;
-  }
   EnsureMainThreadIDIsCorrect();
   CheckForLeaksParam param;
   param.success = false;
diff --git a/lib/lsan/lsan_common_mac.cc b/lib/lsan/lsan_common_mac.cc
index 2508c1d..8dd247c 100644
--- a/lib/lsan/lsan_common_mac.cc
+++ b/lib/lsan/lsan_common_mac.cc
@@ -142,12 +142,6 @@
 }
 
 void ProcessPlatformSpecificAllocations(Frontier *frontier) {
-  mach_port_name_t port;
-  if (task_for_pid(mach_task_self(), internal_getpid(), &port)
-      != KERN_SUCCESS) {
-    return;
-  }
-
   unsigned depth = 1;
   vm_size_t size = 0;
   vm_address_t address = 0;
@@ -158,7 +152,7 @@
 
   while (err == KERN_SUCCESS) {
     struct vm_region_submap_info_64 info;
-    err = vm_region_recurse_64(port, &address, &size, &depth,
+    err = vm_region_recurse_64(mach_task_self(), &address, &size, &depth,
                                (vm_region_info_t)&info, &count);
 
     uptr end_address = address + size;
diff --git a/lib/msan/msan.cc b/lib/msan/msan.cc
index 06bcbdf..ba2d5d5 100644
--- a/lib/msan/msan.cc
+++ b/lib/msan/msan.cc
@@ -59,6 +59,10 @@
 ALIGNED(16) THREADLOCAL u64 __msan_va_arg_tls[kMsanParamTlsSize / sizeof(u64)];
 
 SANITIZER_INTERFACE_ATTRIBUTE
+ALIGNED(16)
+THREADLOCAL u32 __msan_va_arg_origin_tls[kMsanParamTlsSize / sizeof(u32)];
+
+SANITIZER_INTERFACE_ATTRIBUTE
 THREADLOCAL u64 __msan_va_arg_overflow_size_tls;
 
 SANITIZER_INTERFACE_ATTRIBUTE
@@ -277,6 +281,8 @@
   internal_memset(__msan_param_tls, 0, sizeof(__msan_param_tls));
   internal_memset(__msan_retval_tls, 0, sizeof(__msan_retval_tls));
   internal_memset(__msan_va_arg_tls, 0, sizeof(__msan_va_arg_tls));
+  internal_memset(__msan_va_arg_origin_tls, 0,
+                  sizeof(__msan_va_arg_origin_tls));
 
   if (__msan_get_track_origins()) {
     internal_memset(&__msan_retval_origin_tls, 0,
diff --git a/lib/msan/msan_interceptors.cc b/lib/msan/msan_interceptors.cc
index b3429bc..f338d42 100644
--- a/lib/msan/msan_interceptors.cc
+++ b/lib/msan/msan_interceptors.cc
@@ -249,11 +249,11 @@
 // temporary! The following is equivalent on all supported platforms but
 // aarch64 (which uses a different register for sret value).  We have a test
 // to confirm that.
-INTERCEPTOR(void, mallinfo, __sanitizer_mallinfo *sret) {
+INTERCEPTOR(void, mallinfo, __sanitizer_struct_mallinfo *sret) {
 #ifdef __aarch64__
   uptr r8;
   asm volatile("mov %0,x8" : "=r" (r8));
-  sret = reinterpret_cast<__sanitizer_mallinfo*>(r8);
+  sret = reinterpret_cast<__sanitizer_struct_mallinfo*>(r8);
 #endif
   REAL(memset)(sret, 0, sizeof(*sret));
   __msan_unpoison(sret, sizeof(*sret));
diff --git a/lib/msan/tests/msan_test.cc b/lib/msan/tests/msan_test.cc
index 29260f1..19f46ab 100644
--- a/lib/msan/tests/msan_test.cc
+++ b/lib/msan/tests/msan_test.cc
@@ -1934,12 +1934,14 @@
   EXPECT_NOT_POISONED(quo);
 }
 
+#if !defined(__NetBSD__)
 TEST(MemorySanitizer, remquol) {
   int quo;
   long double res = remquof(29.0, 3.0, &quo);
   ASSERT_NE(0.0, res);
   EXPECT_NOT_POISONED(quo);
 }
+#endif
 
 TEST(MemorySanitizer, lgamma) {
   double res = lgamma(1.1);
@@ -1953,11 +1955,13 @@
   EXPECT_NOT_POISONED(signgam);
 }
 
+#if !defined(__NetBSD__)
 TEST(MemorySanitizer, lgammal) {
   long double res = lgammal(1.1);
   ASSERT_NE(0.0, res);
   EXPECT_NOT_POISONED(signgam);
 }
+#endif
 
 TEST(MemorySanitizer, lgamma_r) {
   int sgn;
diff --git a/lib/profile/InstrProfilingValue.c b/lib/profile/InstrProfilingValue.c
index 674a486..c7b01a5 100644
--- a/lib/profile/InstrProfilingValue.c
+++ b/lib/profile/InstrProfilingValue.c
@@ -105,8 +105,7 @@
   return 1;
 }
 
-static ValueProfNode *allocateOneNode(__llvm_profile_data *Data, uint32_t Index,
-                                      uint64_t Value) {
+static ValueProfNode *allocateOneNode(void) {
   ValueProfNode *Node;
 
   if (!hasStaticCounters)
@@ -205,7 +204,7 @@
     return;
   }
 
-  CurVNode = allocateOneNode(PData, CounterIndex, TargetValue);
+  CurVNode = allocateOneNode();
   if (!CurVNode)
     return;
   CurVNode->Value = TargetValue;
diff --git a/lib/profile/WindowsMMap.h b/lib/profile/WindowsMMap.h
index 271619a..ac2c911 100644
--- a/lib/profile/WindowsMMap.h
+++ b/lib/profile/WindowsMMap.h
@@ -12,7 +12,7 @@
 
 #if defined(_WIN32)
 
-#include <BaseTsd.h>
+#include <basetsd.h>
 #include <io.h>
 #include <sys/types.h>
 
diff --git a/lib/safestack/CMakeLists.txt b/lib/safestack/CMakeLists.txt
index 5a1bac2..cc874a3 100644
--- a/lib/safestack/CMakeLists.txt
+++ b/lib/safestack/CMakeLists.txt
@@ -6,29 +6,14 @@
 
 set(SAFESTACK_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 
-if(APPLE)
-  # Build universal binary on APPLE.
+foreach(arch ${SAFESTACK_SUPPORTED_ARCH})
   add_compiler_rt_runtime(clang_rt.safestack
     STATIC
-    OS osx
-    ARCHS ${SAFESTACK_SUPPORTED_ARCH}
+    ARCHS ${arch}
     SOURCES ${SAFESTACK_SOURCES}
-            $<TARGET_OBJECTS:RTInterception.osx>
-            $<TARGET_OBJECTS:RTSanitizerCommon.osx>
-            $<TARGET_OBJECTS:RTSanitizerCommonNoLibc.osx>
+            $<TARGET_OBJECTS:RTInterception.${arch}>
+            $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+            $<TARGET_OBJECTS:RTSanitizerCommonNoLibc.${arch}>
     CFLAGS ${SAFESTACK_CFLAGS}
     PARENT_TARGET safestack)
-else()
-  # Otherwise, build separate libraries for each target.
-  foreach(arch ${SAFESTACK_SUPPORTED_ARCH})
-    add_compiler_rt_runtime(clang_rt.safestack
-      STATIC
-      ARCHS ${arch}
-      SOURCES ${SAFESTACK_SOURCES}
-              $<TARGET_OBJECTS:RTInterception.${arch}>
-              $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
-              $<TARGET_OBJECTS:RTSanitizerCommonNoLibc.${arch}>
-      CFLAGS ${SAFESTACK_CFLAGS}
-      PARENT_TARGET safestack)
-  endforeach()
-endif()
+endforeach()
diff --git a/lib/safestack/safestack.cc b/lib/safestack/safestack.cc
index 8af9362..e682080 100644
--- a/lib/safestack/safestack.cc
+++ b/lib/safestack/safestack.cc
@@ -14,11 +14,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <errno.h>
 #include <limits.h>
 #include <pthread.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <unistd.h>
+#include <stdlib.h>
 #include <sys/resource.h>
 #include <sys/types.h>
 #if !defined(__NetBSD__)
@@ -115,14 +117,6 @@
   unsafe_stack_guard = guard;
 }
 
-static void unsafe_stack_free() {
-  if (unsafe_stack_start) {
-    UnmapOrDie((char *)unsafe_stack_start - unsafe_stack_guard,
-               unsafe_stack_size + unsafe_stack_guard);
-  }
-  unsafe_stack_start = nullptr;
-}
-
 /// Thread data for the cleanup handler
 static pthread_key_t thread_cleanup_key;
 
@@ -149,26 +143,73 @@
                      tinfo->unsafe_stack_guard);
 
   // Make sure out thread-specific destructor will be called
-  // FIXME: we can do this only any other specific key is set by
-  // intercepting the pthread_setspecific function itself
   pthread_setspecific(thread_cleanup_key, (void *)1);
 
   return start_routine(start_routine_arg);
 }
 
-/// Thread-specific data destructor
+/// Linked list used to store exiting threads stack/thread information.
+struct thread_stack_ll {
+  struct thread_stack_ll *next;
+  void *stack_base;
+  size_t size;
+  pid_t pid;
+  tid_t tid;
+};
+
+/// Linked list of unsafe stacks for threads that are exiting. We delay
+/// unmapping them until the thread exits.
+static thread_stack_ll *thread_stacks = nullptr;
+static pthread_mutex_t thread_stacks_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/// Thread-specific data destructor. We want to free the unsafe stack only after
+/// this thread is terminated. libc can call functions in safestack-instrumented
+/// code (like free) after thread-specific data destructors have run.
 static void thread_cleanup_handler(void *_iter) {
-  // We want to free the unsafe stack only after all other destructors
-  // have already run. We force this function to be called multiple times.
-  // User destructors that might run more then PTHREAD_DESTRUCTOR_ITERATIONS-1
-  // times might still end up executing after the unsafe stack is deallocated.
-  size_t iter = (size_t)_iter;
-  if (iter < PTHREAD_DESTRUCTOR_ITERATIONS) {
-    pthread_setspecific(thread_cleanup_key, (void *)(iter + 1));
-  } else {
-    // This is the last iteration
-    unsafe_stack_free();
+  CHECK_NE(unsafe_stack_start, nullptr);
+  pthread_setspecific(thread_cleanup_key, NULL);
+
+  pthread_mutex_lock(&thread_stacks_mutex);
+  // Temporary list to hold the previous threads stacks so we don't hold the
+  // thread_stacks_mutex for long.
+  thread_stack_ll *temp_stacks = thread_stacks;
+  thread_stacks = nullptr;
+  pthread_mutex_unlock(&thread_stacks_mutex);
+
+  pid_t pid = getpid();
+  tid_t tid = GetTid();
+
+  // Free stacks for dead threads
+  thread_stack_ll **stackp = &temp_stacks;
+  while (*stackp) {
+    thread_stack_ll *stack = *stackp;
+    int error;
+    if (stack->pid != pid ||
+        (internal_iserror(TgKill(stack->pid, stack->tid, 0), &error) &&
+         error == ESRCH)) {
+      UnmapOrDie(stack->stack_base, stack->size);
+      *stackp = stack->next;
+      free(stack);
+    } else
+      stackp = &stack->next;
   }
+
+  thread_stack_ll *cur_stack =
+      (thread_stack_ll *)malloc(sizeof(thread_stack_ll));
+  cur_stack->stack_base = (char *)unsafe_stack_start - unsafe_stack_guard;
+  cur_stack->size = unsafe_stack_size + unsafe_stack_guard;
+  cur_stack->pid = pid;
+  cur_stack->tid = tid;
+
+  pthread_mutex_lock(&thread_stacks_mutex);
+  // Merge thread_stacks with the current thread's stack and any remaining
+  // temp_stacks
+  *stackp = thread_stacks;
+  cur_stack->next = temp_stacks;
+  thread_stacks = cur_stack;
+  pthread_mutex_unlock(&thread_stacks_mutex);
+
+  unsafe_stack_start = nullptr;
 }
 
 static void EnsureInterceptorsInitialized();
diff --git a/lib/sanitizer_common/CMakeLists.txt b/lib/sanitizer_common/CMakeLists.txt
index 1be9961..ad3aba0 100644
--- a/lib/sanitizer_common/CMakeLists.txt
+++ b/lib/sanitizer_common/CMakeLists.txt
@@ -16,6 +16,7 @@
   sanitizer_linux.cc
   sanitizer_linux_s390.cc
   sanitizer_mac.cc
+  sanitizer_netbsd.cc
   sanitizer_openbsd.cc
   sanitizer_persistent_allocator.cc
   sanitizer_platform_limits_linux.cc
@@ -158,6 +159,7 @@
   sanitizer_procmaps.h
   sanitizer_quarantine.h
   sanitizer_report_decorator.h
+  sanitizer_ring_buffer.h
   sanitizer_rtems.h
   sanitizer_signal_interceptors.inc
   sanitizer_stackdepot.h
diff --git a/lib/sanitizer_common/sanitizer_allocator_size_class_map.h b/lib/sanitizer_common/sanitizer_allocator_size_class_map.h
index 77ab4fb..0795842 100644
--- a/lib/sanitizer_common/sanitizer_allocator_size_class_map.h
+++ b/lib/sanitizer_common/sanitizer_allocator_size_class_map.h
@@ -232,3 +232,8 @@
 typedef SizeClassMap<3, 4, 8, 17, 128, 16> DefaultSizeClassMap;
 typedef SizeClassMap<3, 4, 8, 17, 64, 14> CompactSizeClassMap;
 typedef SizeClassMap<2, 5, 9, 16, 64, 14> VeryCompactSizeClassMap;
+
+// The following SizeClassMap only holds a way small number of cached entries,
+// allowing for denser per-class arrays, smaller memory footprint and usually
+// better performances in threaded environments.
+typedef SizeClassMap<3, 4, 8, 17, 8, 10> DenseSizeClassMap;
diff --git a/lib/sanitizer_common/sanitizer_atomic_clang_x86.h b/lib/sanitizer_common/sanitizer_atomic_clang_x86.h
index 38feb29..195533e 100644
--- a/lib/sanitizer_common/sanitizer_atomic_clang_x86.h
+++ b/lib/sanitizer_common/sanitizer_atomic_clang_x86.h
@@ -61,8 +61,7 @@
         "emms;"            // Empty mmx state/Reset FP regs
         : "=m" (v)
         : "m" (a->val_dont_use)
-        : // mark the FP stack and mmx registers as clobbered
-          "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+        : // mark the mmx registers as clobbered
 #ifdef __MMX__
           "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
 #endif  // #ifdef __MMX__
@@ -100,8 +99,7 @@
         "emms;"            // Empty mmx state/Reset FP regs
         : "=m" (a->val_dont_use)
         : "m" (v)
-        : // mark the FP stack and mmx registers as clobbered
-          "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+        : // mark the mmx registers as clobbered
 #ifdef __MMX__
           "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
 #endif  // #ifdef __MMX__
diff --git a/lib/sanitizer_common/sanitizer_common.cc b/lib/sanitizer_common/sanitizer_common.cc
index 7d72b0c..6868961 100644
--- a/lib/sanitizer_common/sanitizer_common.cc
+++ b/lib/sanitizer_common/sanitizer_common.cc
@@ -339,11 +339,6 @@
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE
-void __sanitizer_set_death_callback(void (*callback)(void)) {
-  SetUserDieCallback(callback);
-}
-
-SANITIZER_INTERFACE_ATTRIBUTE
 int __sanitizer_install_malloc_and_free_hooks(void (*malloc_hook)(const void *,
                                                                   uptr),
                                               void (*free_hook)(const void *)) {
diff --git a/lib/sanitizer_common/sanitizer_common.h b/lib/sanitizer_common/sanitizer_common.h
index 3b999ed..cf06058 100644
--- a/lib/sanitizer_common/sanitizer_common.h
+++ b/lib/sanitizer_common/sanitizer_common.h
@@ -73,6 +73,7 @@
 uptr GetMaxUserVirtualAddress();
 // Threads
 tid_t GetTid();
+int TgKill(pid_t pid, tid_t tid, int sig);
 uptr GetThreadSelf();
 void GetThreadStackTopAndBottom(bool at_initialization, uptr *stack_top,
                                 uptr *stack_bottom);
diff --git a/lib/sanitizer_common/sanitizer_common_interceptors.inc b/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 801d6a0..5304699 100644
--- a/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -3345,14 +3345,14 @@
   return res;
 }
 
-INTERCEPTOR(INTMAX_T, strtoumax, const char *nptr, char **endptr, int base) {
+INTERCEPTOR(UINTMAX_T, strtoumax, const char *nptr, char **endptr, int base) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strtoumax, nptr, endptr, base);
   // FIXME: under ASan the call below may write to freed memory and corrupt
   // its metadata. See
   // https://github.com/google/sanitizers/issues/321.
   char *real_endptr;
-  INTMAX_T res = REAL(strtoumax)(nptr, &real_endptr, base);
+  UINTMAX_T res = REAL(strtoumax)(nptr, &real_endptr, base);
   StrtolFixAndCheck(ctx, nptr, endptr, real_endptr, base);
   return res;
 }
diff --git a/lib/sanitizer_common/sanitizer_coverage_fuchsia.cc b/lib/sanitizer_common/sanitizer_coverage_fuchsia.cc
index b4ffcca..f27b95f 100644
--- a/lib/sanitizer_common/sanitizer_coverage_fuchsia.cc
+++ b/lib/sanitizer_common/sanitizer_coverage_fuchsia.cc
@@ -146,9 +146,9 @@
       // indices, but we'll never move the mapping address so we don't have
       // any multi-thread synchronization issues with that.
       uintptr_t mapping;
-      status = _zx_vmar_map_old(_zx_vmar_root_self(), 0, vmo_, 0, MappingSize,
-                                ZX_VM_FLAG_PERM_READ | ZX_VM_FLAG_PERM_WRITE,
-                                &mapping);
+      status =
+          _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE,
+                       0, vmo_, 0, MappingSize, &mapping);
       CHECK_EQ(status, ZX_OK);
 
       // Hereafter other threads are free to start storing into
diff --git a/lib/sanitizer_common/sanitizer_coverage_win_sections.cc b/lib/sanitizer_common/sanitizer_coverage_win_sections.cc
index 4b0bbf1..108f76e 100644
--- a/lib/sanitizer_common/sanitizer_coverage_win_sections.cc
+++ b/lib/sanitizer_common/sanitizer_coverage_win_sections.cc
@@ -7,16 +7,57 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines delimiters for Sanitizer Coverage's section.
+// This file defines delimiters for Sanitizer Coverage's section. It contains
+// Windows specific tricks to coax the linker into giving us the start and stop
+// addresses of a section, as ELF linkers can do, to get the size of certain
+// arrays. According to https://msdn.microsoft.com/en-us/library/7977wcck.aspx
+// sections with the same name before "$" are sorted alphabetically by the
+// string that comes after "$" and merged into one section. We take advantage
+// of this by putting data we want the size of into the middle (M) of a section,
+// by using the letter "M" after "$". We get the start of this data (ie:
+// __start_section_name) by making the start variable come at the start of the
+// section (using the letter A after "$"). We do the same to get the end of the
+// data by using the letter "Z" after "$" to make the end variable come after
+// the data. Note that because of our technique the address of the start
+// variable is actually the address of data that comes before our middle
+// section. We also need to prevent the linker from adding any padding. Each
+// technique we use for this is explained in the comments below.
 //===----------------------------------------------------------------------===//
 
 #include "sanitizer_platform.h"
 #if SANITIZER_WINDOWS
 #include <stdint.h>
-#pragma section(".SCOV$A", read, write)  // NOLINT
-#pragma section(".SCOV$Z", read, write)  // NOLINT
 extern "C" {
-__declspec(allocate(".SCOV$A")) uint32_t __start___sancov_guards = 0;
-__declspec(allocate(".SCOV$Z")) uint32_t __stop___sancov_guards = 0;
+// The Guard array and counter array should both be merged into the .data
+// section to reduce the number of PE sections However, because PCTable is
+// constant it should be merged with the .rdata section.
+#pragma section(".SCOV$GA", read, write)  // NOLINT
+// Use align(1) to avoid adding any padding that will mess up clients trying to
+// determine the start and end of the array.
+__declspec(allocate(".SCOV$GA")) __declspec(align(1)) uint64_t
+    __start___sancov_guards = 0;
+#pragma section(".SCOV$GZ", read, write)  // NOLINT
+__declspec(allocate(".SCOV$GZ")) __declspec(align(1)) uint64_t
+    __stop___sancov_guards = 0;
+
+#pragma section(".SCOV$CA", read, write)  // NOLINT
+__declspec(allocate(".SCOV$CA")) __declspec(align(1)) uint64_t
+    __start___sancov_cntrs = 0;
+#pragma section(".SCOV$CZ", read, write)  // NOLINT
+__declspec(allocate(".SCOV$CZ")) __declspec(align(1)) uint64_t
+    __stop___sancov_cntrs = 0;
+
+#pragma comment(linker, "/MERGE:.SCOV=.data")
+
+// Use uint64_t so there won't be any issues if the linker tries to word align
+// the pc array.
+#pragma section(".SCOVP$A", read)  // NOLINT
+__declspec(allocate(".SCOVP$A")) __declspec(align(1)) uint64_t
+    __start___sancov_pcs = 0;
+#pragma section(".SCOVP$Z", read)  // NOLINT
+__declspec(allocate(".SCOVP$Z")) __declspec(align(1)) uint64_t
+    __stop___sancov_pcs = 0;
+
+#pragma comment(linker, "/MERGE:.SCOVP=.rdata")
 }
-#endif // SANITIZER_WINDOWS
+#endif  // SANITIZER_WINDOWS
diff --git a/lib/sanitizer_common/sanitizer_fuchsia.cc b/lib/sanitizer_common/sanitizer_fuchsia.cc
index 3916206..9c54e1e 100644
--- a/lib/sanitizer_common/sanitizer_fuchsia.cc
+++ b/lib/sanitizer_common/sanitizer_fuchsia.cc
@@ -29,9 +29,6 @@
 
 namespace __sanitizer {
 
-// TODO(phosek): remove this and replace it with ZX_TIME_INFINITE
-#define ZX_TIME_INFINITE_OLD INT64_MAX
-
 void NORETURN internal__exit(int exitcode) { _zx_process_exit(exitcode); }
 
 uptr internal_sched_yield() {
@@ -123,7 +120,7 @@
     return;
   while (atomic_exchange(m, MtxSleeping, memory_order_acquire) != MtxUnlocked) {
     zx_status_t status = _zx_futex_wait(reinterpret_cast<zx_futex_t *>(m),
-                                        MtxSleeping, ZX_TIME_INFINITE_OLD);
+                                        MtxSleeping, ZX_TIME_INFINITE);
     if (status != ZX_ERR_BAD_STATE)  // Normal race.
       CHECK_EQ(status, ZX_OK);
   }
@@ -175,8 +172,8 @@
   // TODO(mcgrathr): Maybe allocate a VMAR for all sanitizer heap and use that?
   uintptr_t addr;
   status =
-      _zx_vmar_map_old(_zx_vmar_root_self(), 0, vmo, 0, size,
-                       ZX_VM_FLAG_PERM_READ | ZX_VM_FLAG_PERM_WRITE, &addr);
+      _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0,
+                   vmo, 0, size, &addr);
   _zx_handle_close(vmo);
 
   if (status != ZX_OK) {
@@ -239,10 +236,9 @@
   DCHECK_GE(base + size_, map_size + offset);
   uintptr_t addr;
 
-  status = _zx_vmar_map_old(
-      vmar, offset, vmo, 0, map_size,
-      ZX_VM_FLAG_PERM_READ | ZX_VM_FLAG_PERM_WRITE | ZX_VM_FLAG_SPECIFIC,
-      &addr);
+  status =
+      _zx_vmar_map(vmar, ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_SPECIFIC,
+                   offset, vmo, 0, map_size, &addr);
   _zx_handle_close(vmo);
   if (status != ZX_OK) {
     if (status != ZX_ERR_NO_MEMORY || die_for_nomem) {
@@ -281,14 +277,22 @@
 
 void ReservedAddressRange::Unmap(uptr addr, uptr size) {
   CHECK_LE(size, size_);
-  if (addr == reinterpret_cast<uptr>(base_))
-    // If we unmap the whole range, just null out the base.
-    base_ = (size == size_) ? nullptr : reinterpret_cast<void*>(addr + size);
-  else
+  const zx_handle_t vmar = static_cast<zx_handle_t>(os_handle_);
+  if (addr == reinterpret_cast<uptr>(base_)) {
+    if (size == size_) {
+      // Destroying the vmar effectively unmaps the whole mapping.
+      _zx_vmar_destroy(vmar);
+      _zx_handle_close(vmar);
+      os_handle_ = static_cast<uptr>(ZX_HANDLE_INVALID);
+      DecreaseTotalMmap(size);
+      return;
+    }
+  } else {
     CHECK_EQ(addr + size, reinterpret_cast<uptr>(base_) + size_);
-  size_ -= size;
-  UnmapOrDieVmar(reinterpret_cast<void *>(addr), size,
-                 static_cast<zx_handle_t>(os_handle_));
+  }
+  // Partial unmapping does not affect the fact that the initial range is still
+  // reserved, and the resulting unmapped memory can't be reused.
+  UnmapOrDieVmar(reinterpret_cast<void *>(addr), size, vmar);
 }
 
 // This should never be called.
@@ -321,8 +325,8 @@
   size_t map_size = size + alignment;
   uintptr_t addr;
   status =
-      _zx_vmar_map_old(_zx_vmar_root_self(), 0, vmo, 0, map_size,
-                       ZX_VM_FLAG_PERM_READ | ZX_VM_FLAG_PERM_WRITE, &addr);
+      _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0,
+                   vmo, 0, map_size, &addr);
   if (status == ZX_OK) {
     uintptr_t map_addr = addr;
     uintptr_t map_end = map_addr + map_size;
@@ -334,11 +338,10 @@
                                    sizeof(info), NULL, NULL);
       if (status == ZX_OK) {
         uintptr_t new_addr;
-        status = _zx_vmar_map_old(_zx_vmar_root_self(), addr - info.base, vmo,
-                                  0, size,
-                                  ZX_VM_FLAG_PERM_READ | ZX_VM_FLAG_PERM_WRITE |
-                                      ZX_VM_FLAG_SPECIFIC_OVERWRITE,
-                                  &new_addr);
+        status = _zx_vmar_map(
+            _zx_vmar_root_self(),
+            ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_SPECIFIC_OVERWRITE,
+            addr - info.base, vmo, 0, size, &new_addr);
         if (status == ZX_OK) CHECK_EQ(new_addr, addr);
       }
     }
@@ -398,8 +401,8 @@
       if (vmo_size < max_len) max_len = vmo_size;
       size_t map_size = RoundUpTo(max_len, PAGE_SIZE);
       uintptr_t addr;
-      status = _zx_vmar_map_old(_zx_vmar_root_self(), 0, vmo, 0, map_size,
-                                ZX_VM_FLAG_PERM_READ, &addr);
+      status = _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ, 0, vmo, 0,
+                            map_size, &addr);
       if (status == ZX_OK) {
         *buff = reinterpret_cast<char *>(addr);
         *buff_size = map_size;
diff --git a/lib/sanitizer_common/sanitizer_internal_defs.h b/lib/sanitizer_common/sanitizer_internal_defs.h
index f8a405b..3402243 100644
--- a/lib/sanitizer_common/sanitizer_internal_defs.h
+++ b/lib/sanitizer_common/sanitizer_internal_defs.h
@@ -105,8 +105,8 @@
 //
 // FIXME: do we have anything like this on Mac?
 #ifndef SANITIZER_CAN_USE_PREINIT_ARRAY
-#if ((SANITIZER_LINUX && !SANITIZER_ANDROID) || SANITIZER_OPENBSD) && \
-    !defined(PIC)
+#if ((SANITIZER_LINUX && !SANITIZER_ANDROID) || SANITIZER_OPENBSD || \
+     SANITIZER_FUCHSIA) && !defined(PIC)
 #define SANITIZER_CAN_USE_PREINIT_ARRAY 1
 // Before Solaris 11.4, .preinit_array is fully supported only with GNU ld.
 // FIXME: Check for those conditions.
@@ -275,8 +275,6 @@
 // NOTE: Functions below must be defined in each run-time.
 void NORETURN Die();
 
-// FIXME: No, this shouldn't be in the sanitizer interface.
-SANITIZER_INTERFACE_ATTRIBUTE
 void NORETURN CheckFailed(const char *file, int line, const char *cond,
                           u64 v1, u64 v2);
 
@@ -431,6 +429,7 @@
 namespace __ubsan { using namespace __sanitizer; }  // NOLINT
 namespace __xray  { using namespace __sanitizer; }  // NOLINT
 namespace __interception  { using namespace __sanitizer; }  // NOLINT
+namespace __hwasan  { using namespace __sanitizer; }  // NOLINT
 
 
 #endif  // SANITIZER_DEFS_H
diff --git a/lib/sanitizer_common/sanitizer_libc.cc b/lib/sanitizer_common/sanitizer_libc.cc
index 4b462bf..4032cb1 100644
--- a/lib/sanitizer_common/sanitizer_libc.cc
+++ b/lib/sanitizer_common/sanitizer_libc.cc
@@ -73,6 +73,18 @@
 }
 
 void *internal_memset(void* s, int c, uptr n) {
+  // Optimize for the most performance-critical case:
+  if ((reinterpret_cast<uptr>(s) % 16) == 0 && (n % 16) == 0) {
+    u64 *p = reinterpret_cast<u64*>(s);
+    u64 *e = p + n / 8;
+    u64 v = c;
+    v |= v << 8;
+    v |= v << 16;
+    v |= v << 32;
+    for (; p < e; p += 2)
+      p[0] = p[1] = v;
+    return s;
+  }
   // The next line prevents Clang from making a call to memset() instead of the
   // loop below.
   // FIXME: building the runtime with -ffreestanding is a better idea. However
diff --git a/lib/sanitizer_common/sanitizer_linux.cc b/lib/sanitizer_common/sanitizer_linux.cc
index 8d39c4d..cd1e46b 100644
--- a/lib/sanitizer_common/sanitizer_linux.cc
+++ b/lib/sanitizer_common/sanitizer_linux.cc
@@ -31,10 +31,6 @@
 #include <asm/param.h>
 #endif
 
-#if SANITIZER_NETBSD
-#include <lwp.h>
-#endif
-
 // For mips64, syscall(__NR_stat) fills the buffer in the 'struct kernel_stat'
 // format. Struct kernel_stat is defined as 'struct stat' in asm/stat.h. To
 // access stat from asm/stat.h, without conflicting with definition in
@@ -69,6 +65,7 @@
 #endif
 #if SANITIZER_OPENBSD
 #include <sys/futex.h>
+#include <sys/sysctl.h>
 #endif
 #include <unistd.h>
 
@@ -117,6 +114,9 @@
 // <linux/futex.h> is broken on some linux distributions.
 const int FUTEX_WAIT = 0;
 const int FUTEX_WAKE = 1;
+const int FUTEX_PRIVATE_FLAG = 128;
+const int FUTEX_WAIT_PRIVATE = FUTEX_WAIT | FUTEX_PRIVATE_FLAG;
+const int FUTEX_WAKE_PRIVATE = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
 #endif  // SANITIZER_LINUX
 
 // Are we using 32-bit or 64-bit Linux syscalls?
@@ -170,14 +170,11 @@
 #endif
 
 // --------------- sanitizer_libc.h
-#if !SANITIZER_SOLARIS
+#if !SANITIZER_SOLARIS && !SANITIZER_NETBSD
 #if !SANITIZER_S390 && !SANITIZER_OPENBSD
 uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
                    OFF_T offset) {
-#if SANITIZER_NETBSD
-  return internal_syscall64(SYSCALL(mmap), addr, length, prot, flags, fd,
-                              (long)0, offset);
-#elif SANITIZER_FREEBSD || SANITIZER_LINUX_USES_64BIT_SYSCALLS
+#if SANITIZER_FREEBSD || SANITIZER_LINUX_USES_64BIT_SYSCALLS
   return internal_syscall(SYSCALL(mmap), (uptr)addr, length, prot, flags, fd,
                           offset);
 #else
@@ -191,11 +188,11 @@
 
 #if !SANITIZER_OPENBSD
 uptr internal_munmap(void *addr, uptr length) {
-  return internal_syscall_ptr(SYSCALL(munmap), (uptr)addr, length);
+  return internal_syscall(SYSCALL(munmap), (uptr)addr, length);
 }
 
 int internal_mprotect(void *addr, uptr length, int prot) {
-  return internal_syscall_ptr(SYSCALL(mprotect), (uptr)addr, length, prot);
+  return internal_syscall(SYSCALL(mprotect), (uptr)addr, length, prot);
 }
 #endif
 
@@ -207,7 +204,7 @@
 #if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
   return internal_syscall(SYSCALL(openat), AT_FDCWD, (uptr)filename, flags);
 #else
-  return internal_syscall_ptr(SYSCALL(open), (uptr)filename, flags);
+  return internal_syscall(SYSCALL(open), (uptr)filename, flags);
 #endif
 }
 
@@ -216,32 +213,28 @@
   return internal_syscall(SYSCALL(openat), AT_FDCWD, (uptr)filename, flags,
                           mode);
 #else
-  return internal_syscall_ptr(SYSCALL(open), (uptr)filename, flags, mode);
+  return internal_syscall(SYSCALL(open), (uptr)filename, flags, mode);
 #endif
 }
 
 uptr internal_read(fd_t fd, void *buf, uptr count) {
   sptr res;
-  HANDLE_EINTR(res, (sptr)internal_syscall_ptr(SYSCALL(read), fd, (uptr)buf,
-               count));
+  HANDLE_EINTR(res,
+               (sptr)internal_syscall(SYSCALL(read), fd, (uptr)buf, count));
   return res;
 }
 
 uptr internal_write(fd_t fd, const void *buf, uptr count) {
   sptr res;
-  HANDLE_EINTR(res, (sptr)internal_syscall_ptr(SYSCALL(write), fd, (uptr)buf,
-               count));
+  HANDLE_EINTR(res,
+               (sptr)internal_syscall(SYSCALL(write), fd, (uptr)buf, count));
   return res;
 }
 
 uptr internal_ftruncate(fd_t fd, uptr size) {
   sptr res;
-#if SANITIZER_NETBSD
-  HANDLE_EINTR(res, internal_syscall64(SYSCALL(ftruncate), fd, 0, (s64)size));
-#else
   HANDLE_EINTR(res, (sptr)internal_syscall(SYSCALL(ftruncate), fd,
                (OFF_T)size));
-#endif
   return res;
 }
 
@@ -313,9 +306,8 @@
 #endif
 
 uptr internal_stat(const char *path, void *buf) {
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
-  return internal_syscall_ptr(SYSCALL(fstatat), AT_FDCWD, (uptr)path, (uptr)buf,
-                              0);
+#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
+  return internal_syscall(SYSCALL(fstatat), AT_FDCWD, (uptr)path, (uptr)buf, 0);
 #elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
   return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf,
                           0);
@@ -338,9 +330,7 @@
 }
 
 uptr internal_lstat(const char *path, void *buf) {
-#if SANITIZER_NETBSD
-  return internal_syscall_ptr(SYSCALL(lstat), path, buf);
-#elif SANITIZER_FREEBSD || SANITIZER_OPENBSD
+#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
   return internal_syscall(SYSCALL(fstatat), AT_FDCWD, (uptr)path, (uptr)buf,
                           AT_SYMLINK_NOFOLLOW);
 #elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
@@ -365,16 +355,16 @@
 }
 
 uptr internal_fstat(fd_t fd, void *buf) {
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD ||              \
+#if SANITIZER_FREEBSD || SANITIZER_OPENBSD || \
     SANITIZER_LINUX_USES_64BIT_SYSCALLS
-#if SANITIZER_MIPS64 && !SANITIZER_NETBSD && !SANITIZER_OPENBSD
+#if SANITIZER_MIPS64 && !SANITIZER_OPENBSD
   // For mips64, fstat syscall fills buffer in the format of kernel_stat
   struct kernel_stat kbuf;
   int res = internal_syscall(SYSCALL(fstat), fd, &kbuf);
   kernel_stat_to_stat(&kbuf, (struct stat *)buf);
   return res;
 # else
-  return internal_syscall_ptr(SYSCALL(fstat), fd, (uptr)buf);
+  return internal_syscall(SYSCALL(fstat), fd, (uptr)buf);
 # endif
 #else
   struct stat64 buf64;
@@ -404,10 +394,10 @@
   return internal_syscall(SYSCALL(readlinkat), AT_FDCWD, (uptr)path, (uptr)buf,
                           bufsize);
 #elif SANITIZER_OPENBSD
-  return internal_syscall_ptr(SYSCALL(readlinkat), AT_FDCWD, (uptr)path,
-                              (uptr)buf, bufsize);
+  return internal_syscall(SYSCALL(readlinkat), AT_FDCWD, (uptr)path, (uptr)buf,
+                          bufsize);
 #else
-  return internal_syscall_ptr(SYSCALL(readlink), path, buf, bufsize);
+  return internal_syscall(SYSCALL(readlink), path, buf, bufsize);
 #endif
 }
 
@@ -415,7 +405,7 @@
 #if SANITIZER_USES_CANONICAL_LINUX_SYSCALLS || SANITIZER_OPENBSD
   return internal_syscall(SYSCALL(unlinkat), AT_FDCWD, (uptr)path, 0);
 #else
-  return internal_syscall_ptr(SYSCALL(unlink), (uptr)path);
+  return internal_syscall(SYSCALL(unlink), (uptr)path);
 #endif
 }
 
@@ -424,7 +414,7 @@
   return internal_syscall(SYSCALL(renameat), AT_FDCWD, (uptr)oldpath, AT_FDCWD,
                           (uptr)newpath);
 #else
-  return internal_syscall_ptr(SYSCALL(rename), (uptr)oldpath, (uptr)newpath);
+  return internal_syscall(SYSCALL(rename), (uptr)oldpath, (uptr)newpath);
 #endif
 }
 
@@ -433,7 +423,7 @@
 }
 
 void internal__exit(int exitcode) {
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
   internal_syscall(SYSCALL(exit), exitcode);
 #else
   internal_syscall(SYSCALL(exit_group), exitcode);
@@ -445,17 +435,17 @@
   struct timespec ts;
   ts.tv_sec = 1;
   ts.tv_nsec = 0;
-  int res = internal_syscall_ptr(SYSCALL(nanosleep), &ts, &ts);
+  int res = internal_syscall(SYSCALL(nanosleep), &ts, &ts);
   if (res) return ts.tv_sec;
   return 0;
 }
 
 uptr internal_execve(const char *filename, char *const argv[],
                      char *const envp[]) {
-  return internal_syscall_ptr(SYSCALL(execve), (uptr)filename, (uptr)argv,
-                              (uptr)envp);
+  return internal_syscall(SYSCALL(execve), (uptr)filename, (uptr)argv,
+                          (uptr)envp);
 }
-#endif // !SANITIZER_SOLARIS
+#endif  // !SANITIZER_SOLARIS && !SANITIZER_NETBSD
 
 // ----------------- sanitizer_common.h
 bool FileExists(const char *filename) {
@@ -470,6 +460,7 @@
   return S_ISREG(st.st_mode);
 }
 
+#if !SANITIZER_NETBSD
 tid_t GetTid() {
 #if SANITIZER_FREEBSD
   long Tid;
@@ -477,8 +468,6 @@
   return Tid;
 #elif SANITIZER_OPENBSD
   return internal_syscall(SYSCALL(getthrid));
-#elif SANITIZER_NETBSD
-  return _lwp_self();
 #elif SANITIZER_SOLARIS
   return thr_self();
 #else
@@ -486,28 +475,43 @@
 #endif
 }
 
-#if !SANITIZER_SOLARIS
+int TgKill(pid_t pid, tid_t tid, int sig) {
+#if SANITIZER_LINUX
+  return internal_syscall(SYSCALL(tgkill), pid, tid, sig);
+#elif SANITIZER_FREEBSD
+  return internal_syscall(SYSCALL(thr_kill2), pid, tid, sig);
+#elif SANITIZER_OPENBSD
+  (void)pid;
+  return internal_syscall(SYSCALL(thrkill), tid, sig, nullptr);
+#elif SANITIZER_SOLARIS
+  (void)pid;
+  return thr_kill(tid, sig);
+#endif
+}
+#endif
+
+#if !SANITIZER_SOLARIS && !SANITIZER_NETBSD
 u64 NanoTime() {
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
   timeval tv;
 #else
   kernel_timeval tv;
 #endif
   internal_memset(&tv, 0, sizeof(tv));
-  internal_syscall_ptr(SYSCALL(gettimeofday), &tv, 0);
+  internal_syscall(SYSCALL(gettimeofday), &tv, 0);
   return (u64)tv.tv_sec * 1000*1000*1000 + tv.tv_usec * 1000;
 }
 
 uptr internal_clock_gettime(__sanitizer_clockid_t clk_id, void *tp) {
-  return internal_syscall_ptr(SYSCALL(clock_gettime), clk_id, tp);
+  return internal_syscall(SYSCALL(clock_gettime), clk_id, tp);
 }
-#endif // !SANITIZER_SOLARIS
+#endif  // !SANITIZER_SOLARIS && !SANITIZER_NETBSD
 
 // Like getenv, but reads env directly from /proc (on Linux) or parses the
 // 'environ' array (on some others) and does not use libc. This function
 // should be called first inside __asan_init.
 const char *GetEnv(const char *name) {
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD ||              \
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || \
     SANITIZER_SOLARIS
   if (::environ != 0) {
     uptr NameLen = internal_strlen(name);
@@ -585,8 +589,8 @@
   // kern.ps_strings sysctl, which returns a pointer to a structure containing
   // this information. See also <sys/exec.h>.
   ps_strings *pss;
-  size_t sz = sizeof(pss);
-  if (sysctlbyname("kern.ps_strings", &pss, &sz, NULL, 0) == -1) {
+  uptr sz = sizeof(pss);
+  if (internal_sysctlbyname("kern.ps_strings", &pss, &sz, NULL, 0) == -1) {
     Printf("sysctl kern.ps_strings failed\n");
     Die();
   }
@@ -628,10 +632,10 @@
     CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME,
   };
   char path[400];
-  size_t len;
+  uptr len;
 
   len = sizeof(path);
-  if (sysctl(name, ARRAY_SIZE(name), path, &len, NULL, 0) != -1)
+  if (internal_sysctl(name, ARRAY_SIZE(name), path, &len, NULL, 0) != -1)
     pathname = path;
 #elif SANITIZER_SOLARIS
   pathname = getexecname();
@@ -669,7 +673,8 @@
 #elif SANITIZER_NETBSD
     sched_yield(); /* No userspace futex-like synchronization */
 #else
-    internal_syscall(SYSCALL(futex), (uptr)m, FUTEX_WAIT, MtxSleeping, 0, 0, 0);
+    internal_syscall(SYSCALL(futex), (uptr)m, FUTEX_WAIT_PRIVATE, MtxSleeping,
+                     0, 0, 0);
 #endif
   }
 }
@@ -684,7 +689,7 @@
 #elif SANITIZER_NETBSD
                    /* No userspace futex-like synchronization */
 #else
-    internal_syscall(SYSCALL(futex), (uptr)m, FUTEX_WAKE, 1, 0, 0, 0);
+    internal_syscall(SYSCALL(futex), (uptr)m, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
 #endif
   }
 }
@@ -699,7 +704,9 @@
 // The actual size of this structure is specified by d_reclen.
 // Note that getdents64 uses a different structure format. We only provide the
 // 32-bit syscall here.
-#if SANITIZER_NETBSD || SANITIZER_OPENBSD
+#if SANITIZER_NETBSD
+// Not used
+#elif SANITIZER_OPENBSD
 // struct dirent is different for Linux and us. At this moment, we use only
 // d_fileno (Linux call this d_ino), d_reclen, and d_name.
 struct linux_dirent {
@@ -726,27 +733,15 @@
 };
 #endif
 
-#if !SANITIZER_SOLARIS
+#if !SANITIZER_SOLARIS && !SANITIZER_NETBSD
 // Syscall wrappers.
 uptr internal_ptrace(int request, int pid, void *addr, void *data) {
-#if SANITIZER_NETBSD
-  // XXX We need additional work for ptrace:
-  //   - for request, we use PT_FOO whereas Linux uses PTRACE_FOO
-  //   - data is int for us, but void * for Linux
-  //   - Linux sometimes uses data in the case where we use addr instead
-  // At this moment, this function is used only within
-  // "#if SANITIZER_LINUX && defined(__x86_64__)" block in
-  // sanitizer_stoptheworld_linux_libcdep.cc.
-  return internal_syscall_ptr(SYSCALL(ptrace), request, pid, (uptr)addr,
-                              (uptr)data);
-#else
   return internal_syscall(SYSCALL(ptrace), request, pid, (uptr)addr,
                           (uptr)data);
-#endif
 }
 
 uptr internal_waitpid(int pid, int *status, int options) {
-  return internal_syscall_ptr(SYSCALL(wait4), pid, (uptr)status, options,
+  return internal_syscall(SYSCALL(wait4), pid, (uptr)status, options,
                           0 /* rusage */);
 }
 
@@ -764,16 +759,12 @@
 #elif SANITIZER_USES_CANONICAL_LINUX_SYSCALLS
   return internal_syscall(SYSCALL(getdents64), fd, (uptr)dirp, count);
 #else
-  return internal_syscall_ptr(SYSCALL(getdents), fd, (uptr)dirp, count);
+  return internal_syscall(SYSCALL(getdents), fd, (uptr)dirp, count);
 #endif
 }
 
 uptr internal_lseek(fd_t fd, OFF_T offset, int whence) {
-#if SANITIZER_NETBSD
-  return internal_syscall64(SYSCALL(lseek), fd, 0, offset, whence);
-#else
   return internal_syscall(SYSCALL(lseek), fd, offset, whence);
-#endif
 }
 
 #if SANITIZER_LINUX
@@ -783,7 +774,7 @@
 #endif
 
 uptr internal_sigaltstack(const void *ss, void *oss) {
-  return internal_syscall_ptr(SYSCALL(sigaltstack), (uptr)ss, (uptr)oss);
+  return internal_syscall(SYSCALL(sigaltstack), (uptr)ss, (uptr)oss);
 }
 
 int internal_fork() {
@@ -794,6 +785,28 @@
 #endif
 }
 
+#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
+int internal_sysctl(const int *name, unsigned int namelen, void *oldp,
+                    uptr *oldlenp, const void *newp, uptr newlen) {
+#if SANITIZER_OPENBSD
+  return sysctl(name, namelen, oldp, (size_t *)oldlenp, (void *)newp,
+                (size_t)newlen);
+#else
+  return sysctl(name, namelen, oldp, (size_t *)oldlenp, newp, (size_t)newlen);
+#endif
+}
+
+int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp,
+                          const void *newp, uptr newlen) {
+#if SANITIZER_OPENBSD
+  return sysctlbyname(sname, oldp, (size_t *)oldlenp, (void *)newp,
+                      (size_t)newlen);
+#else
+  return sysctlbyname(sname, oldp, (size_t *)oldlenp, newp, (size_t)newlen);
+#endif
+}
+#endif
+
 #if SANITIZER_LINUX
 #define SA_RESTORER 0x04000000
 // Doesn't set sa_restorer if the caller did not set it, so use with caution
@@ -863,8 +876,8 @@
 
 uptr internal_sigprocmask(int how, __sanitizer_sigset_t *set,
                           __sanitizer_sigset_t *oldset) {
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
-  return internal_syscall_ptr(SYSCALL(sigprocmask), how, set, oldset);
+#if SANITIZER_FREEBSD || SANITIZER_OPENBSD
+  return internal_syscall(SYSCALL(sigprocmask), how, set, oldset);
 #else
   __sanitizer_kernel_sigset_t *k_set = (__sanitizer_kernel_sigset_t *)set;
   __sanitizer_kernel_sigset_t *k_oldset = (__sanitizer_kernel_sigset_t *)oldset;
@@ -902,9 +915,20 @@
   const uptr bit = signum % (sizeof(k_set->sig[0]) * 8);
   return k_set->sig[idx] & (1 << bit);
 }
-#endif  // SANITIZER_LINUX
+#elif SANITIZER_FREEBSD
+void internal_sigdelset(__sanitizer_sigset_t *set, int signum) {
+  sigset_t *rset = reinterpret_cast<sigset_t *>(set);
+  sigdelset(rset, signum);
+}
+
+bool internal_sigismember(__sanitizer_sigset_t *set, int signum) {
+  sigset_t *rset = reinterpret_cast<sigset_t *>(set);
+  return sigismember(rset, signum);
+}
+#endif
 #endif // !SANITIZER_SOLARIS
 
+#if !SANITIZER_NETBSD
 // ThreadLister implementation.
 ThreadLister::ThreadLister(pid_t pid) : pid_(pid), buffer_(4096) {
   char task_directory_path[80];
@@ -991,6 +1015,7 @@
   if (!internal_iserror(descriptor_))
     internal_close(descriptor_);
 }
+#endif
 
 #if SANITIZER_WORDSIZE == 32
 // Take care of unusable kernel area in top gigabyte.
@@ -1093,8 +1118,9 @@
   const int Mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
 #endif
   const char *default_module_name = "kern.proc.pathname";
-  size_t Size = buf_len;
-  bool IsErr = (sysctl(Mib, ARRAY_SIZE(Mib), buf, &Size, NULL, 0) != 0);
+  uptr Size = buf_len;
+  bool IsErr =
+      (internal_sysctl(Mib, ARRAY_SIZE(Mib), buf, &Size, NULL, 0) != 0);
   int readlink_error = IsErr ? errno : 0;
   uptr module_name_len = Size;
 #else
@@ -1636,6 +1662,16 @@
 
 static atomic_uint32_t android_api_level;
 
+static AndroidApiLevel AndroidDetectApiLevelStatic() {
+#if __ANDROID_API__ <= 19
+  return ANDROID_KITKAT;
+#elif __ANDROID_API__ <= 22
+  return ANDROID_LOLLIPOP_MR1;
+#else
+  return ANDROID_POST_LOLLIPOP;
+#endif
+}
+
 static AndroidApiLevel AndroidDetectApiLevel() {
   if (!&dl_iterate_phdr)
     return ANDROID_KITKAT; // K or lower
@@ -1648,11 +1684,14 @@
   // interesting to detect.
 }
 
+extern "C" __attribute__((weak)) void* _DYNAMIC;
+
 AndroidApiLevel AndroidGetApiLevel() {
   AndroidApiLevel level =
       (AndroidApiLevel)atomic_load(&android_api_level, memory_order_relaxed);
   if (level) return level;
-  level = AndroidDetectApiLevel();
+  level = &_DYNAMIC == nullptr ? AndroidDetectApiLevelStatic()
+                               : AndroidDetectApiLevel();
   atomic_store(&android_api_level, level, memory_order_relaxed);
   return level;
 }
@@ -1959,13 +1998,13 @@
 #if SANITIZER_NETBSD
   int mib[3];
   int paxflags;
-  size_t len = sizeof(paxflags);
+  uptr len = sizeof(paxflags);
 
   mib[0] = CTL_PROC;
   mib[1] = internal_getpid();
   mib[2] = PROC_PID_PAXFLAGS;
 
-  if (UNLIKELY(sysctl(mib, 3, &paxflags, &len, NULL, 0) == -1)) {
+  if (UNLIKELY(internal_sysctl(mib, 3, &paxflags, &len, NULL, 0) == -1)) {
     Printf("sysctl failed\n");
     Die();
   }
diff --git a/lib/sanitizer_common/sanitizer_linux.h b/lib/sanitizer_common/sanitizer_linux.h
index 975d654..8322f4e 100644
--- a/lib/sanitizer_common/sanitizer_linux.h
+++ b/lib/sanitizer_common/sanitizer_linux.h
@@ -69,6 +69,8 @@
 uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
                     int *parent_tidptr, void *newtls, int *child_tidptr);
 #endif
+#elif SANITIZER_FREEBSD
+void internal_sigdelset(__sanitizer_sigset_t *set, int signum);
 #endif  // SANITIZER_LINUX
 
 // This class reads thread IDs from /proc/<pid>/task using only syscalls.
diff --git a/lib/sanitizer_common/sanitizer_linux_libcdep.cc b/lib/sanitizer_common/sanitizer_linux_libcdep.cc
index 4962ff8..7859557 100644
--- a/lib/sanitizer_common/sanitizer_linux_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_linux_libcdep.cc
@@ -652,10 +652,10 @@
 #if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
   u32 ncpu;
   int req[2];
-  size_t len = sizeof(ncpu);
+  uptr len = sizeof(ncpu);
   req[0] = CTL_HW;
   req[1] = HW_NCPU;
-  CHECK_EQ(sysctl(req, 2, &ncpu, &len, NULL, 0), 0);
+  CHECK_EQ(internal_sysctl(req, 2, &ncpu, &len, NULL, 0), 0);
   return ncpu;
 #elif SANITIZER_ANDROID && !defined(CPU_COUNT) && !defined(__aarch64__)
   // Fall back to /sys/devices/system/cpu on Android when cpu_set_t doesn't
diff --git a/lib/sanitizer_common/sanitizer_mac.cc b/lib/sanitizer_common/sanitizer_mac.cc
index 180d7c1..90bd088 100644
--- a/lib/sanitizer_common/sanitizer_mac.cc
+++ b/lib/sanitizer_common/sanitizer_mac.cc
@@ -213,6 +213,18 @@
   return fork();
 }
 
+int internal_sysctl(const int *name, unsigned int namelen, void *oldp,
+                    uptr *oldlenp, const void *newp, uptr newlen) {
+  return sysctl(const_cast<int *>(name), namelen, oldp, (size_t *)oldlenp,
+                const_cast<void *>(newp), (size_t)newlen);
+}
+
+int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp,
+                          const void *newp, uptr newlen) {
+  return sysctlbyname(sname, oldp, (size_t *)oldlenp, const_cast<void *>(newp),
+                      (size_t)newlen);
+}
+
 int internal_forkpty(int *amaster) {
   int master, slave;
   if (openpty(&master, &slave, nullptr, nullptr, nullptr) == -1) return -1;
@@ -499,9 +511,9 @@
   uptr len = 0, maxlen = sizeof(version) / sizeof(version[0]);
   for (uptr i = 0; i < maxlen; i++) version[i] = '\0';
   // Get the version length.
-  CHECK_NE(sysctl(mib, 2, 0, &len, 0, 0), -1);
+  CHECK_NE(internal_sysctl(mib, 2, 0, &len, 0, 0), -1);
   CHECK_LT(len, maxlen);
-  CHECK_NE(sysctl(mib, 2, version, &len, 0, 0), -1);
+  CHECK_NE(internal_sysctl(mib, 2, version, &len, 0, 0), -1);
   switch (version[0]) {
     case '9': return MACOS_VERSION_LEOPARD;
     case '1': {
@@ -511,6 +523,10 @@
         case '2': return MACOS_VERSION_MOUNTAIN_LION;
         case '3': return MACOS_VERSION_MAVERICKS;
         case '4': return MACOS_VERSION_YOSEMITE;
+        case '5': return MACOS_VERSION_EL_CAPITAN;
+        case '6': return MACOS_VERSION_SIERRA;
+        case '7': return MACOS_VERSION_HIGH_SIERRA;
+        case '8': return MACOS_VERSION_MOJAVE;
         default:
           if (IsDigit(version[1]))
             return MACOS_VERSION_UNKNOWN_NEWER;
@@ -890,10 +906,10 @@
     (sizeof(__sanitizer_task_vm_info) / sizeof(natural_t)))
 
 uptr GetTaskInfoMaxAddress() {
-  __sanitizer_task_vm_info vm_info = {};
+  __sanitizer_task_vm_info vm_info = {} /* zero initialize */;
   mach_msg_type_number_t count = __SANITIZER_TASK_VM_INFO_COUNT;
   int err = task_info(mach_task_self(), TASK_VM_INFO, (int *)&vm_info, &count);
-  if (err == 0) {
+  if (err == 0 && vm_info.max_address != 0) {
     return vm_info.max_address - 1;
   } else {
     // xnu cannot provide vm address limit
@@ -1060,14 +1076,16 @@
   // Do nothing.
 }
 
-// FIXME: implement on this platform.
 bool GetRandom(void *buffer, uptr length, bool blocking) {
-  UNIMPLEMENTED();
+  if (!buffer || !length || length > 256)
+    return false;
+  // arc4random never fails.
+  arc4random_buf(buffer, length);
+  return true;
 }
 
-// FIXME: implement on this platform.
 u32 GetNumberOfCPUs() {
-  UNIMPLEMENTED();
+  return (u32)sysconf(_SC_NPROCESSORS_ONLN);
 }
 
 }  // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_mac.h b/lib/sanitizer_common/sanitizer_mac.h
index e022a2c..58423a7 100644
--- a/lib/sanitizer_common/sanitizer_mac.h
+++ b/lib/sanitizer_common/sanitizer_mac.h
@@ -40,6 +40,10 @@
   MACOS_VERSION_MOUNTAIN_LION,
   MACOS_VERSION_MAVERICKS,
   MACOS_VERSION_YOSEMITE,
+  MACOS_VERSION_EL_CAPITAN,
+  MACOS_VERSION_SIERRA,
+  MACOS_VERSION_HIGH_SIERRA,
+  MACOS_VERSION_MOJAVE,
   MACOS_VERSION_UNKNOWN_NEWER
 };
 
diff --git a/lib/sanitizer_common/sanitizer_netbsd.cc b/lib/sanitizer_common/sanitizer_netbsd.cc
new file mode 100644
index 0000000..83beb00
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_netbsd.cc
@@ -0,0 +1,330 @@
+//===-- sanitizer_netbsd.cc -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is shared between Sanitizer run-time libraries and implements
+// NetBSD-specific functions from sanitizer_libc.h.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_platform.h"
+
+#if SANITIZER_NETBSD
+
+#include "sanitizer_common.h"
+#include "sanitizer_flags.h"
+#include "sanitizer_getauxval.h"
+#include "sanitizer_internal_defs.h"
+#include "sanitizer_libc.h"
+#include "sanitizer_linux.h"
+#include "sanitizer_mutex.h"
+#include "sanitizer_placement_new.h"
+#include "sanitizer_procmaps.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <sys/exec.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <link.h>
+#include <lwp.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <ucontext.h>
+#include <unistd.h>
+
+extern "C" void *__mmap(void *, size_t, int, int, int, int,
+                        off_t) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int __sysctl(const int *, unsigned int, void *, size_t *,
+                        const void *, size_t) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int _sys_close(int) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int _sys_open(const char *, int, ...) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" ssize_t _sys_read(int, void *, size_t) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" ssize_t _sys_write(int, const void *,
+                              size_t) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int __ftruncate(int, int, off_t) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" ssize_t _sys_readlink(const char *, char *,
+                                 size_t) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int _sys_sched_yield() SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int _sys___nanosleep50(const void *,
+                                  void *) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int _sys_execve(const char *, char *const[],
+                           char *const[]) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" off_t __lseek(int, int, off_t, int) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int __fork() SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int _sys___sigprocmask14(int, const void *,
+                                    void *) SANITIZER_WEAK_ATTRIBUTE;
+extern "C" int _sys___wait450(int wpid, int *, int,
+                              void *) SANITIZER_WEAK_ATTRIBUTE;
+
+namespace __sanitizer {
+
+static void *GetRealLibcAddress(const char *symbol) {
+  void *real = dlsym(RTLD_NEXT, symbol);
+  if (!real)
+    real = dlsym(RTLD_DEFAULT, symbol);
+  if (!real) {
+    Printf("GetRealLibcAddress failed for symbol=%s", symbol);
+    Die();
+  }
+  return real;
+}
+
+#define _REAL(func, ...) real##_##func(__VA_ARGS__)
+#define DEFINE__REAL(ret_type, func, ...)                              \
+  static ret_type (*real_##func)(__VA_ARGS__) = NULL;                  \
+  if (!real_##func) {                                                  \
+    real_##func = (ret_type(*)(__VA_ARGS__))GetRealLibcAddress(#func); \
+  }                                                                    \
+  CHECK(real_##func);
+
+// --------------- sanitizer_libc.h
+uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
+                   OFF_T offset) {
+  CHECK(&__mmap);
+  return (uptr)__mmap(addr, length, prot, flags, fd, 0, offset);
+}
+
+uptr internal_munmap(void *addr, uptr length) {
+  DEFINE__REAL(int, munmap, void *a, uptr b);
+  return _REAL(munmap, addr, length);
+}
+
+int internal_mprotect(void *addr, uptr length, int prot) {
+  DEFINE__REAL(int, mprotect, void *a, uptr b, int c);
+  return _REAL(mprotect, addr, length, prot);
+}
+
+uptr internal_close(fd_t fd) {
+  CHECK(&_sys_close);
+  return _sys_close(fd);
+}
+
+uptr internal_open(const char *filename, int flags) {
+  CHECK(&_sys_open);
+  return _sys_open(filename, flags);
+}
+
+uptr internal_open(const char *filename, int flags, u32 mode) {
+  CHECK(&_sys_open);
+  return _sys_open(filename, flags, mode);
+}
+
+uptr internal_read(fd_t fd, void *buf, uptr count) {
+  sptr res;
+  CHECK(&_sys_read);
+  HANDLE_EINTR(res, (sptr)_sys_read(fd, buf, (size_t)count));
+  return res;
+}
+
+uptr internal_write(fd_t fd, const void *buf, uptr count) {
+  sptr res;
+  CHECK(&_sys_write);
+  HANDLE_EINTR(res, (sptr)_sys_write(fd, buf, count));
+  return res;
+}
+
+uptr internal_ftruncate(fd_t fd, uptr size) {
+  sptr res;
+  CHECK(&__ftruncate);
+  HANDLE_EINTR(res, __ftruncate(fd, 0, (s64)size));
+  return res;
+}
+
+uptr internal_stat(const char *path, void *buf) {
+  DEFINE__REAL(int, __stat50, const char *a, void *b);
+  return _REAL(__stat50, path, buf);
+}
+
+uptr internal_lstat(const char *path, void *buf) {
+  DEFINE__REAL(int, __lstat50, const char *a, void *b);
+  return _REAL(__lstat50, path, buf);
+}
+
+uptr internal_fstat(fd_t fd, void *buf) {
+  DEFINE__REAL(int, __fstat50, int a, void *b);
+  return _REAL(__fstat50, fd, buf);
+}
+
+uptr internal_filesize(fd_t fd) {
+  struct stat st;
+  if (internal_fstat(fd, &st))
+    return -1;
+  return (uptr)st.st_size;
+}
+
+uptr internal_dup2(int oldfd, int newfd) {
+  DEFINE__REAL(int, dup2, int a, int b);
+  return _REAL(dup2, oldfd, newfd);
+}
+
+uptr internal_readlink(const char *path, char *buf, uptr bufsize) {
+  CHECK(&_sys_readlink);
+  return (uptr)_sys_readlink(path, buf, bufsize);
+}
+
+uptr internal_unlink(const char *path) {
+  DEFINE__REAL(int, unlink, const char *a);
+  return _REAL(unlink, path);
+}
+
+uptr internal_rename(const char *oldpath, const char *newpath) {
+  DEFINE__REAL(int, rename, const char *a, const char *b);
+  return _REAL(rename, oldpath, newpath);
+}
+
+uptr internal_sched_yield() {
+  CHECK(&_sys_sched_yield);
+  return _sys_sched_yield();
+}
+
+void internal__exit(int exitcode) {
+  DEFINE__REAL(void, _exit, int a);
+  _REAL(_exit, exitcode);
+  Die();  // Unreachable.
+}
+
+unsigned int internal_sleep(unsigned int seconds) {
+  struct timespec ts;
+  ts.tv_sec = 1;
+  ts.tv_nsec = 0;
+  CHECK(&_sys___nanosleep50);
+  int res = _sys___nanosleep50(&ts, &ts);
+  if (res)
+    return ts.tv_sec;
+  return 0;
+}
+
+uptr internal_execve(const char *filename, char *const argv[],
+                     char *const envp[]) {
+  CHECK(&_sys_execve);
+  return _sys_execve(filename, argv, envp);
+}
+
+tid_t GetTid() {
+  DEFINE__REAL(int, _lwp_self);
+  return _REAL(_lwp_self);
+}
+
+int TgKill(pid_t pid, tid_t tid, int sig) {
+  DEFINE__REAL(int, _lwp_kill, int a, int b);
+  (void)pid;
+  return _REAL(_lwp_kill, tid, sig);
+}
+
+u64 NanoTime() {
+  timeval tv;
+  DEFINE__REAL(int, __gettimeofday50, void *a, void *b);
+  internal_memset(&tv, 0, sizeof(tv));
+  _REAL(__gettimeofday50, &tv, 0);
+  return (u64)tv.tv_sec * 1000 * 1000 * 1000 + tv.tv_usec * 1000;
+}
+
+uptr internal_clock_gettime(__sanitizer_clockid_t clk_id, void *tp) {
+  DEFINE__REAL(int, __clock_gettime50, __sanitizer_clockid_t a, void *b);
+  return _REAL(__clock_gettime50, clk_id, tp);
+}
+
+uptr internal_ptrace(int request, int pid, void *addr, void *data) {
+  Printf("internal_ptrace not implemented for NetBSD");
+  Die();
+  return 0;
+}
+
+uptr internal_waitpid(int pid, int *status, int options) {
+  CHECK(&_sys___wait450);
+  return _sys___wait450(pid, status, options, 0 /* rusage */);
+}
+
+uptr internal_getpid() {
+  DEFINE__REAL(int, getpid);
+  return _REAL(getpid);
+}
+
+uptr internal_getppid() {
+  DEFINE__REAL(int, getppid);
+  return _REAL(getppid);
+}
+
+uptr internal_getdents(fd_t fd, void *dirp, unsigned int count) {
+  DEFINE__REAL(int, __getdents30, int a, void *b, size_t c);
+  return _REAL(__getdents30, fd, dirp, count);
+}
+
+uptr internal_lseek(fd_t fd, OFF_T offset, int whence) {
+  CHECK(&__lseek);
+  return __lseek(fd, 0, offset, whence);
+}
+
+uptr internal_prctl(int option, uptr arg2, uptr arg3, uptr arg4, uptr arg5) {
+  Printf("internal_prctl not implemented for NetBSD");
+  Die();
+  return 0;
+}
+
+uptr internal_sigaltstack(const void *ss, void *oss) {
+  DEFINE__REAL(int, __sigaltstack14, const void *a, void *b);
+  return _REAL(__sigaltstack14, ss, oss);
+}
+
+int internal_fork() {
+  CHECK(&__fork);
+  return __fork();
+}
+
+int internal_sysctl(const int *name, unsigned int namelen, void *oldp,
+                    uptr *oldlenp, const void *newp, uptr newlen) {
+  CHECK(&__sysctl);
+  return __sysctl(name, namelen, oldp, (size_t *)oldlenp, newp, (size_t)newlen);
+}
+
+int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp,
+                          const void *newp, uptr newlen) {
+  DEFINE__REAL(int, sysctlbyname, const char *a, void *b, size_t *c,
+               const void *d, size_t e);
+  return _REAL(sysctlbyname, sname, oldp, (size_t *)oldlenp, newp,
+               (size_t)newlen);
+}
+
+uptr internal_sigprocmask(int how, __sanitizer_sigset_t *set,
+                          __sanitizer_sigset_t *oldset) {
+  CHECK(&_sys___sigprocmask14);
+  return _sys___sigprocmask14(how, set, oldset);
+}
+
+void internal_sigfillset(__sanitizer_sigset_t *set) {
+  DEFINE__REAL(int, __sigfillset14, const void *a);
+  (void)_REAL(__sigfillset14, set);
+}
+
+void internal_sigemptyset(__sanitizer_sigset_t *set) {
+  DEFINE__REAL(int, __sigemptyset14, const void *a);
+  (void)_REAL(__sigemptyset14, set);
+}
+
+uptr intrnal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
+                   int *parent_tidptr, void *newtls, int *child_tidptr) {
+  Printf("internal_clone not implemented for NetBSD");
+  Die();
+  return 0;
+}
+
+}  // namespace __sanitizer
+
+#endif
diff --git a/lib/sanitizer_common/sanitizer_openbsd.cc b/lib/sanitizer_common/sanitizer_openbsd.cc
index 2aea7cb..dc95510 100644
--- a/lib/sanitizer_common/sanitizer_openbsd.cc
+++ b/lib/sanitizer_common/sanitizer_openbsd.cc
@@ -54,9 +54,9 @@
 uptr ReadBinaryName(/*out*/char *buf, uptr buf_len) {
   // On OpenBSD we cannot get the full path
   struct kinfo_proc kp;
-  size_t kl;
+  uptr kl;
   const int Mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid()};
-  if (sysctl(Mib, ARRAY_SIZE(Mib), &kp, &kl, NULL, 0) != -1)
+  if (internal_sysctl(Mib, ARRAY_SIZE(Mib), &kp, &kl, NULL, 0) != -1)
     return internal_snprintf(buf,
                              (KI_MAXCOMLEN < buf_len ? KI_MAXCOMLEN : buf_len),
                              "%s", kp.p_comm);
@@ -64,23 +64,23 @@
 }
 
 static void GetArgsAndEnv(char ***argv, char ***envp) {
-  size_t nargv;
-  size_t nenv;
+  uptr nargv;
+  uptr nenv;
   int argvmib[4] = {CTL_KERN, KERN_PROC_ARGS, getpid(), KERN_PROC_ARGV};
   int envmib[4] = {CTL_KERN, KERN_PROC_ARGS, getpid(), KERN_PROC_ENV};
-  if (sysctl(argvmib, 4, NULL, &nargv, NULL, 0) == -1) {
+  if (internal_sysctl(argvmib, 4, NULL, &nargv, NULL, 0) == -1) {
     Printf("sysctl KERN_PROC_NARGV failed\n");
     Die();
   }
-  if (sysctl(envmib, 4, NULL, &nenv, NULL, 0) == -1) {
+  if (internal_sysctl(envmib, 4, NULL, &nenv, NULL, 0) == -1) {
     Printf("sysctl KERN_PROC_NENV failed\n");
     Die();
   }
-  if (sysctl(argvmib, 4, &argv, &nargv, NULL, 0) == -1) {
+  if (internal_sysctl(argvmib, 4, &argv, &nargv, NULL, 0) == -1) {
     Printf("sysctl KERN_PROC_ARGV failed\n");
     Die();
   }
-  if (sysctl(envmib, 4, &envp, &nenv, NULL, 0) == -1) {
+  if (internal_sysctl(envmib, 4, &envp, &nenv, NULL, 0) == -1) {
     Printf("sysctl KERN_PROC_ENV failed\n");
     Die();
   }
diff --git a/lib/sanitizer_common/sanitizer_platform.h b/lib/sanitizer_common/sanitizer_platform.h
index d81e255..106a147 100644
--- a/lib/sanitizer_common/sanitizer_platform.h
+++ b/lib/sanitizer_common/sanitizer_platform.h
@@ -235,7 +235,12 @@
 #if defined(__mips__)
 # define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 40)
 #elif defined(__aarch64__)
-# define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 48)
+# if SANITIZER_MAC
+// Darwin iOS/ARM64 has a 36-bit VMA, 64GiB VM
+#  define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 36)
+# else
+#  define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 48)
+# endif
 #else
 # define SANITIZER_MMAP_RANGE_SIZE FIRST_32_SECOND_64(1ULL << 32, 1ULL << 47)
 #endif
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
index c27055f..cd1b73d 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
+++ b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
@@ -1037,7 +1037,11 @@
 CHECK_SIZE_AND_OFFSET(cmsghdr, cmsg_level);
 CHECK_SIZE_AND_OFFSET(cmsghdr, cmsg_type);
 
-#if SANITIZER_LINUX && (!defined(__ANDROID__) || __ANDROID_API__ >= 21)
+#ifndef __GLIBC_PREREQ
+#define __GLIBC_PREREQ(x, y) 0
+#endif
+
+#if SANITIZER_LINUX && (__ANDROID_API__ >= 21 || __GLIBC_PREREQ (2, 14))
 CHECK_TYPE_SIZE(mmsghdr);
 CHECK_SIZE_AND_OFFSET(mmsghdr, msg_hdr);
 CHECK_SIZE_AND_OFFSET(mmsghdr, msg_len);
@@ -1078,9 +1082,6 @@
 // Can't write checks for sa_handler and sa_sigaction due to them being
 // preprocessor macros.
 CHECK_STRUCT_SIZE_AND_OFFSET(sigaction, sa_mask);
-#ifndef __GLIBC_PREREQ
-#define __GLIBC_PREREQ(x, y) 0
-#endif
 #if !defined(__s390x__) || __GLIBC_PREREQ (2, 20)
 // On s390x glibc 2.19 and earlier sa_flags was unsigned long, and sa_resv
 // didn't exist.
@@ -1206,7 +1207,7 @@
 #endif
 
 #if SANITIZER_LINUX
-COMPILER_CHECK(sizeof(__sanitizer_mallinfo) == sizeof(struct mallinfo));
+COMPILER_CHECK(sizeof(__sanitizer_struct_mallinfo) == sizeof(struct mallinfo));
 #endif
 
 #if !SANITIZER_ANDROID
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index f89a113..3f67916 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -187,13 +187,13 @@
 #endif // SANITIZER_LINUX || SANITIZER_FREEBSD
 
 #if SANITIZER_ANDROID
-  struct __sanitizer_mallinfo {
+  struct __sanitizer_struct_mallinfo {
     uptr v[10];
   };
 #endif
 
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
-  struct __sanitizer_mallinfo {
+  struct __sanitizer_struct_mallinfo {
     int v[10];
   };
 
diff --git a/lib/sanitizer_common/sanitizer_posix.h b/lib/sanitizer_common/sanitizer_posix.h
index da44700..3075a13 100644
--- a/lib/sanitizer_common/sanitizer_posix.h
+++ b/lib/sanitizer_common/sanitizer_posix.h
@@ -60,6 +60,11 @@
 int internal_fork();
 int internal_forkpty(int *amaster);
 
+int internal_sysctl(const int *name, unsigned int namelen, void *oldp,
+                    uptr *oldlenp, const void *newp, uptr newlen);
+int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp,
+                          const void *newp, uptr newlen);
+
 // These functions call appropriate pthread_ functions directly, bypassing
 // the interceptor. They are weak and may not be present in some tools.
 SANITIZER_WEAK_ATTRIBUTE
diff --git a/lib/sanitizer_common/sanitizer_procmaps_bsd.cc b/lib/sanitizer_common/sanitizer_procmaps_bsd.cc
index e41dc98..4cebd98 100644
--- a/lib/sanitizer_common/sanitizer_procmaps_bsd.cc
+++ b/lib/sanitizer_common/sanitizer_procmaps_bsd.cc
@@ -67,8 +67,8 @@
 #endif
   };
 
-  size_t Size = 0;
-  int Err = sysctl(Mib, ARRAY_SIZE(Mib), NULL, &Size, NULL, 0);
+  uptr Size = 0;
+  int Err = internal_sysctl(Mib, ARRAY_SIZE(Mib), NULL, &Size, NULL, 0);
   CHECK_EQ(Err, 0);
   CHECK_GT(Size, 0);
 
@@ -76,7 +76,7 @@
   size_t MmapedSize = Size * 4 / 3;
   void *VmMap = MmapOrDie(MmapedSize, "ReadProcMaps()");
   Size = MmapedSize;
-  Err = sysctl(Mib, ARRAY_SIZE(Mib), VmMap, &Size, NULL, 0);
+  Err = internal_sysctl(Mib, ARRAY_SIZE(Mib), VmMap, &Size, NULL, 0);
   CHECK_EQ(Err, 0);
   proc_maps->data = (char *)VmMap;
 #else
@@ -88,7 +88,7 @@
   if (Size > 0x10000)
     Size = 0x10000;
   Size = (Size / sizeof(struct kinfo_vmentry)) * sizeof(struct kinfo_vmentry);
-  Err = sysctl(Mib, ARRAY_SIZE(Mib), Mem, &Size, NULL, 0);
+  Err = internal_sysctl(Mib, ARRAY_SIZE(Mib), Mem, &Size, NULL, 0);
   CHECK_EQ(Err, 0);
   MmapedSize = Size;
   proc_maps->data = Mem;
diff --git a/lib/sanitizer_common/sanitizer_procmaps_mac.cc b/lib/sanitizer_common/sanitizer_procmaps_mac.cc
index 0167ab1..d90e4b9 100644
--- a/lib/sanitizer_common/sanitizer_procmaps_mac.cc
+++ b/lib/sanitizer_common/sanitizer_procmaps_mac.cc
@@ -140,12 +140,6 @@
 // early in the process, when dyld is one of the only images loaded,
 // so it will be hit after only a few iterations.
 static mach_header *get_dyld_image_header() {
-  mach_port_name_t port;
-  if (task_for_pid(mach_task_self(), internal_getpid(), &port) !=
-      KERN_SUCCESS) {
-    return nullptr;
-  }
-
   unsigned depth = 1;
   vm_size_t size = 0;
   vm_address_t address = 0;
@@ -154,7 +148,7 @@
 
   while (true) {
     struct vm_region_submap_info_64 info;
-    err = vm_region_recurse_64(port, &address, &size, &depth,
+    err = vm_region_recurse_64(mach_task_self(), &address, &size, &depth,
                                (vm_region_info_t)&info, &count);
     if (err != KERN_SUCCESS) return nullptr;
 
diff --git a/lib/sanitizer_common/sanitizer_ring_buffer.h b/lib/sanitizer_common/sanitizer_ring_buffer.h
new file mode 100644
index 0000000..d15f27f
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_ring_buffer.h
@@ -0,0 +1,162 @@
+//===-- sanitizer_ring_buffer.h ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple ring buffer.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_RING_BUFFER_H
+#define SANITIZER_RING_BUFFER_H
+
+#include "sanitizer_common.h"
+
+namespace __sanitizer {
+// RingBuffer<T>: fixed-size ring buffer optimized for speed of push().
+// T should be a POD type and sizeof(T) should be divisible by sizeof(void*).
+// At creation, all elements are zero.
+template<class T>
+class RingBuffer {
+ public:
+  COMPILER_CHECK(sizeof(T) % sizeof(void *) == 0);
+  static RingBuffer *New(uptr Size) {
+    void *Ptr = MmapOrDie(SizeInBytes(Size), "RingBuffer");
+    RingBuffer *RB = reinterpret_cast<RingBuffer*>(Ptr);
+    uptr End = reinterpret_cast<uptr>(Ptr) + SizeInBytes(Size);
+    RB->last_ = RB->next_ = reinterpret_cast<T*>(End - sizeof(T));
+    return RB;
+  }
+  void Delete() {
+    UnmapOrDie(this, SizeInBytes(size()));
+  }
+  uptr size() const {
+    return last_ + 1 -
+           reinterpret_cast<T *>(reinterpret_cast<uptr>(this) +
+                                 2 * sizeof(T *));
+  }
+
+  static uptr SizeInBytes(uptr Size) {
+    return Size * sizeof(T) + 2 * sizeof(T*);
+  }
+
+  uptr SizeInBytes() { return SizeInBytes(size()); }
+
+  void push(T t) {
+    *next_ = t;
+    next_--;
+    // The condition below works only if sizeof(T) is divisible by sizeof(T*).
+    if (next_ <= reinterpret_cast<T*>(&next_))
+      next_ = last_;
+  }
+
+  T operator[](uptr Idx) const {
+    CHECK_LT(Idx, size());
+    sptr IdxNext = Idx + 1;
+    if (IdxNext > last_ - next_)
+      IdxNext -= size();
+    return next_[IdxNext];
+  }
+
+ private:
+  RingBuffer() {}
+  ~RingBuffer() {}
+  RingBuffer(const RingBuffer&) = delete;
+
+  // Data layout:
+  // LNDDDDDDDD
+  // D: data elements.
+  // L: last_, always points to the last data element.
+  // N: next_, initially equals to last_, is decremented on every push,
+  //    wraps around if it's less or equal than its own address.
+  T *last_;
+  T *next_;
+  T data_[1];  // flexible array.
+};
+
+// A ring buffer with externally provided storage that encodes its state in 8
+// bytes. Has significant constraints on size and alignment of storage.
+// See a comment in hwasan/hwasan_thread_list.h for the motivation behind this.
+#if SANITIZER_WORDSIZE == 64
+template <class T>
+class CompactRingBuffer {
+  // Top byte of long_ stores the buffer size in pages.
+  // Lower bytes store the address of the next buffer element.
+  static constexpr int kPageSizeBits = 12;
+  static constexpr int kSizeShift = 56;
+  static constexpr uptr kNextMask = (1ULL << kSizeShift) - 1;
+
+  uptr GetStorageSize() const { return (long_ >> kSizeShift) << kPageSizeBits; }
+
+  void Init(void *storage, uptr size) {
+    CHECK_EQ(sizeof(CompactRingBuffer<T>), sizeof(void *));
+    CHECK(IsPowerOfTwo(size));
+    CHECK_GE(size, 1 << kPageSizeBits);
+    CHECK_LE(size, 128 << kPageSizeBits);
+    CHECK_EQ(size % 4096, 0);
+    CHECK_EQ(size % sizeof(T), 0);
+    CHECK_EQ((uptr)storage % (size * 2), 0);
+    long_ = (uptr)storage | ((size >> kPageSizeBits) << kSizeShift);
+  }
+
+  void SetNext(const T *next) {
+    long_ = (long_ & ~kNextMask) | (uptr)next;
+  }
+
+ public:
+  CompactRingBuffer(void *storage, uptr size) {
+    Init(storage, size);
+  }
+
+  // A copy constructor of sorts.
+  CompactRingBuffer(const CompactRingBuffer &other, void *storage) {
+    uptr size = other.GetStorageSize();
+    internal_memcpy(storage, other.StartOfStorage(), size);
+    Init(storage, size);
+    uptr Idx = other.Next() - (const T *)other.StartOfStorage();
+    SetNext((const T *)storage + Idx);
+  }
+
+  T *Next() const { return (T *)(long_ & kNextMask); }
+
+  void *StartOfStorage() const {
+    return (void *)((uptr)Next() & ~(GetStorageSize() - 1));
+  }
+
+  void *EndOfStorage() const {
+    return (void *)((uptr)StartOfStorage() + GetStorageSize());
+  }
+
+  uptr size() const { return GetStorageSize() / sizeof(T); }
+
+  void push(T t) {
+    T *next = Next();
+    *next = t;
+    next++;
+    next = (T *)((uptr)next & ~GetStorageSize());
+    SetNext(next);
+  }
+
+  T operator[](uptr Idx) const {
+    CHECK_LT(Idx, size());
+    const T *Begin = (const T *)StartOfStorage();
+    sptr StorageIdx = Next() - Begin;
+    StorageIdx -= (sptr)(Idx + 1);
+    if (StorageIdx < 0)
+      StorageIdx += size();
+    return Begin[StorageIdx];
+  }
+
+ public:
+  ~CompactRingBuffer() {}
+  CompactRingBuffer(const CompactRingBuffer &) = delete;
+
+  uptr long_;
+};
+#endif
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_RING_BUFFER_H
diff --git a/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc b/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc
index 747a4a7..c87b18e 100644
--- a/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc
@@ -114,11 +114,25 @@
     return;
   }
   InternalScopedString frame_desc(GetPageSizeCached());
-  RenderFrame(&frame_desc, fmt, 0, frame->info,
-              common_flags()->symbolize_vs_style,
-              common_flags()->strip_path_prefix);
-  internal_strncpy(out_buf, frame_desc.data(), out_buf_size);
-  out_buf[out_buf_size - 1] = 0;
+  uptr frame_num = 0;
+  // Reserve one byte for the final 0.
+  char *out_end = out_buf + out_buf_size - 1;
+  for (SymbolizedStack *cur = frame; cur && out_buf < out_end;
+       cur = cur->next) {
+    frame_desc.clear();
+    RenderFrame(&frame_desc, fmt, frame_num++, cur->info,
+                common_flags()->symbolize_vs_style,
+                common_flags()->strip_path_prefix);
+    if (!frame_desc.length())
+      continue;
+    // Reserve one byte for the terminating 0.
+    uptr n = out_end - out_buf - 1;
+    internal_strncpy(out_buf, frame_desc.data(), n);
+    out_buf += __sanitizer::Min<uptr>(n, frame_desc.length());
+    *out_buf++ = 0;
+  }
+  CHECK(out_buf <= out_end);
+  *out_buf = 0;
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE
diff --git a/lib/sanitizer_common/sanitizer_syscall_generic.inc b/lib/sanitizer_common/sanitizer_syscall_generic.inc
index e4ed1b4..ddfb263 100644
--- a/lib/sanitizer_common/sanitizer_syscall_generic.inc
+++ b/lib/sanitizer_common/sanitizer_syscall_generic.inc
@@ -11,46 +11,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_NETBSD || \
-  SANITIZER_OPENBSD || SANITIZER_SOLARIS
+// NetBSD uses libc calls directly
+#if !SANITIZER_NETBSD
+
+#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_OPENBSD || SANITIZER_SOLARIS
 # define SYSCALL(name) SYS_ ## name
 #else
 # define SYSCALL(name) __NR_ ## name
 #endif
 
-#if SANITIZER_NETBSD
-// We use 3 kinds of internal_syscall's for different types of retval in order
-// to address differences in calling conventions (e.g. registers to place the
-// return value in).
-//   - internal_syscall     for 32-bit length (int, pid_t)
-//   - internal_syscall64   for 64-bit length (off_t)
-//   - internal_syscall_ptr for pointer and (s)size_t
-# define  internal_syscall      syscall
-# define  internal_syscall64    __syscall
-// Handle syscall renames manually
-# define SYS_stat SYS___stat50
-# define SYS_lstat SYS___lstat50
-# define SYS_fstat SYS___fstat50
-# define SYS_gettimeofday SYS___gettimeofday50
-# define SYS_wait4 SYS___wait450
-# define SYS_getdents SYS___getdents30
-# define SYS_sigaltstack SYS___sigaltstack14
-# define SYS_sigprocmask SYS___sigprocmask14
-# define SYS_nanosleep SYS___nanosleep50
-# define SYS_clock_gettime SYS___clock_gettime50
-# if SANITIZER_WORDSIZE == 64
-#  define internal_syscall_ptr  __syscall
-# else
-#  define internal_syscall_ptr  syscall
-# endif
-#elif defined(__x86_64__) && (SANITIZER_FREEBSD || SANITIZER_MAC)
+#if defined(__x86_64__) && (SANITIZER_FREEBSD || SANITIZER_MAC)
 # define internal_syscall __syscall
-# define internal_syscall64 __syscall
-# define internal_syscall_ptr __syscall
 # else
 # define internal_syscall syscall
-# define internal_syscall64 syscall
-# define internal_syscall_ptr syscall
+#endif
+
 #endif
 
 bool internal_iserror(uptr retval, int *rverrno) {
diff --git a/lib/sanitizer_common/sanitizer_syscall_linux_aarch64.inc b/lib/sanitizer_common/sanitizer_syscall_linux_aarch64.inc
index 1f05ed9..7ab1d76 100644
--- a/lib/sanitizer_common/sanitizer_syscall_linux_aarch64.inc
+++ b/lib/sanitizer_common/sanitizer_syscall_linux_aarch64.inc
@@ -127,9 +127,6 @@
 
 #define internal_syscall(...) __SYSCALL_DISP(__internal_syscall, __VA_ARGS__)
 
-#define internal_syscall_ptr internal_syscall
-#define internal_syscall64 internal_syscall
-
 // Helper function used to avoid cobbler errno.
 bool internal_iserror(uptr retval, int *rverrno) {
   if (retval >= (uptr)-4095) {
diff --git a/lib/sanitizer_common/sanitizer_syscall_linux_arm.inc b/lib/sanitizer_common/sanitizer_syscall_linux_arm.inc
index a3fdb9e..b4fd096 100644
--- a/lib/sanitizer_common/sanitizer_syscall_linux_arm.inc
+++ b/lib/sanitizer_common/sanitizer_syscall_linux_arm.inc
@@ -127,9 +127,6 @@
 
 #define internal_syscall(...) __SYSCALL_DISP(__internal_syscall, __VA_ARGS__)
 
-#define internal_syscall_ptr internal_syscall
-#define internal_syscall64 internal_syscall
-
 // Helper function used to avoid cobbler errno.
 bool internal_iserror(uptr retval, int *rverrno) {
   if (retval >= (uptr)-4095) {
diff --git a/lib/sanitizer_common/sanitizer_syscall_linux_x86_64.inc b/lib/sanitizer_common/sanitizer_syscall_linux_x86_64.inc
index 327aaa8..9853a6a 100644
--- a/lib/sanitizer_common/sanitizer_syscall_linux_x86_64.inc
+++ b/lib/sanitizer_common/sanitizer_syscall_linux_x86_64.inc
@@ -20,9 +20,6 @@
   return retval;
 }
 
-#define internal_syscall_ptr internal_syscall
-#define internal_syscall64 internal_syscall
-
 template <typename T1>
 static uptr internal_syscall(u64 nr, T1 arg1) {
   u64 retval;
diff --git a/lib/sanitizer_common/sanitizer_termination.cc b/lib/sanitizer_common/sanitizer_termination.cc
index 8243fc0..35e4403 100644
--- a/lib/sanitizer_common/sanitizer_termination.cc
+++ b/lib/sanitizer_common/sanitizer_termination.cc
@@ -84,3 +84,12 @@
 }
 
 } // namespace __sanitizer
+
+using namespace __sanitizer;  // NOLINT
+
+extern "C" {
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_set_death_callback(void (*callback)(void)) {
+  SetUserDieCallback(callback);
+}
+}  // extern "C"
diff --git a/lib/sanitizer_common/sanitizer_win.cc b/lib/sanitizer_common/sanitizer_win.cc
index 38e567d..e8f5919 100644
--- a/lib/sanitizer_common/sanitizer_win.cc
+++ b/lib/sanitizer_common/sanitizer_win.cc
@@ -29,6 +29,10 @@
 #include "sanitizer_placement_new.h"
 #include "sanitizer_win_defs.h"
 
+#if defined(PSAPI_VERSION) && PSAPI_VERSION == 1
+#pragma comment(lib, "psapi")
+#endif
+
 // A macro to tell the compiler that this part of the code cannot be reached,
 // if the compiler supports this feature. Since we're using this in
 // code that is called when terminating the process, the expansion of the
diff --git a/lib/sanitizer_common/sanitizer_win_defs.h b/lib/sanitizer_common/sanitizer_win_defs.h
index 077ff9c..10fc2d0 100644
--- a/lib/sanitizer_common/sanitizer_win_defs.h
+++ b/lib/sanitizer_common/sanitizer_win_defs.h
@@ -17,17 +17,27 @@
 #if SANITIZER_WINDOWS
 
 #ifndef WINAPI
-#ifdef _M_IX86
+#if defined(_M_IX86) || defined(__i386__)
 #define WINAPI __stdcall
 #else
 #define WINAPI
 #endif
 #endif
 
-#if defined(_WIN64)
-#define WIN_SYM_PREFIX
-#else
+#if defined(_M_IX86) || defined(__i386__)
 #define WIN_SYM_PREFIX "_"
+#else
+#define WIN_SYM_PREFIX
+#endif
+
+// For MinGW, the /export: directives contain undecorated symbols, contrary to
+// link/lld-link. The GNU linker doesn't support /alternatename and /include
+// though, thus lld-link in MinGW mode interprets them in the same way as
+// in the default mode.
+#ifdef __MINGW32__
+#define WIN_EXPORT_PREFIX
+#else
+#define WIN_EXPORT_PREFIX WIN_SYM_PREFIX
 #endif
 
 // Intermediate macro to ensure the parameter is expanded before stringified.
@@ -62,8 +72,8 @@
   __pragma(comment(linker, "/include:" WIN_SYM_PREFIX STRINGIFY(Name)))
 
 #define WIN_EXPORT(ExportedName, Name)                                         \
-  __pragma(comment(linker, "/export:" WIN_SYM_PREFIX STRINGIFY(ExportedName)   \
-                                  "=" WIN_SYM_PREFIX STRINGIFY(Name)))
+  __pragma(comment(linker, "/export:" WIN_EXPORT_PREFIX STRINGIFY(ExportedName)\
+                                  "=" WIN_EXPORT_PREFIX STRINGIFY(Name)))
 
 // We cannot define weak functions on Windows, but we can use WIN_WEAK_ALIAS()
 // which defines an alias to a default implementation, and only works when
diff --git a/lib/sanitizer_common/tests/CMakeLists.txt b/lib/sanitizer_common/tests/CMakeLists.txt
index 401682b..4642c59 100644
--- a/lib/sanitizer_common/tests/CMakeLists.txt
+++ b/lib/sanitizer_common/tests/CMakeLists.txt
@@ -26,6 +26,7 @@
   sanitizer_posix_test.cc
   sanitizer_printf_test.cc
   sanitizer_procmaps_test.cc
+  sanitizer_ring_buffer_test.cc
   sanitizer_quarantine_test.cc
   sanitizer_stackdepot_test.cc
   sanitizer_stacktrace_printer_test.cc
diff --git a/lib/sanitizer_common/tests/sanitizer_allocator_test.cc b/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
index ef4c10b..05fef25 100644
--- a/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
@@ -89,11 +89,20 @@
   static const uptr kFlags = 0;
 };
 
+struct AP64Dense {
+  static const uptr kSpaceBeg = kAllocatorSpace;
+  static const uptr kSpaceSize = kAllocatorSize;
+  static const uptr kMetadataSize = 16;
+  typedef DenseSizeClassMap SizeClassMap;
+  typedef NoOpMapUnmapCallback MapUnmapCallback;
+  static const uptr kFlags = 0;
+};
 
 typedef SizeClassAllocator64<AP64> Allocator64;
 typedef SizeClassAllocator64<AP64Dyn> Allocator64Dynamic;
 typedef SizeClassAllocator64<AP64Compact> Allocator64Compact;
 typedef SizeClassAllocator64<AP64VeryCompact> Allocator64VeryCompact;
+typedef SizeClassAllocator64<AP64Dense> Allocator64Dense;
 #elif defined(__mips64)
 static const u64 kAddressSpaceSize = 1ULL << 40;
 #elif defined(__aarch64__)
@@ -144,6 +153,10 @@
   TestSizeClassMap<InternalSizeClassMap>();
 }
 
+TEST(SanitizerCommon, DenseSizeClassMap) {
+  TestSizeClassMap<VeryCompactSizeClassMap>();
+}
+
 template <class Allocator>
 void TestSizeClassAllocator() {
   Allocator *a = new Allocator;
@@ -226,9 +239,14 @@
 }
 
 #if !SANITIZER_ANDROID
+//FIXME(kostyak): find values so that those work on Android as well.
 TEST(SanitizerCommon, SizeClassAllocator64Compact) {
   TestSizeClassAllocator<Allocator64Compact>();
 }
+
+TEST(SanitizerCommon, SizeClassAllocator64Dense) {
+  TestSizeClassAllocator<Allocator64Dense>();
+}
 #endif
 
 TEST(SanitizerCommon, SizeClassAllocator64VeryCompact) {
diff --git a/lib/sanitizer_common/tests/sanitizer_common_test.cc b/lib/sanitizer_common/tests/sanitizer_common_test.cc
index 0177484..6b091de 100644
--- a/lib/sanitizer_common/tests/sanitizer_common_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_common_test.cc
@@ -354,7 +354,8 @@
   EXPECT_STREQ("012345678", str.data());
 }
 
-#if SANITIZER_LINUX
+#if SANITIZER_LINUX || SANITIZER_FREEBSD || \
+  SANITIZER_OPENBSD || SANITIZER_MAC || SANITIZER_IOS
 TEST(SanitizerCommon, GetRandom) {
   u8 buffer_1[32], buffer_2[32];
   for (bool blocking : { false, true }) {
diff --git a/lib/sanitizer_common/tests/sanitizer_ring_buffer_test.cc b/lib/sanitizer_common/tests/sanitizer_ring_buffer_test.cc
new file mode 100644
index 0000000..80aa57c
--- /dev/null
+++ b/lib/sanitizer_common/tests/sanitizer_ring_buffer_test.cc
@@ -0,0 +1,99 @@
+//===-- sanitizer_vector_test.cc ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of *Sanitizer runtime.
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_ring_buffer.h"
+#include "gtest/gtest.h"
+
+namespace __sanitizer {
+
+struct LargeStruct {
+  int64_t v;
+  int64_t extra[3];
+
+  explicit LargeStruct(int64_t v) : v(v) {}
+  operator int64_t() { return v; }
+};
+
+struct Struct10Bytes {
+  short t[3];
+};
+
+TEST(RingBuffer, Construct) {
+  RingBuffer<int64_t> *RBlong = RingBuffer<int64_t>::New(20);
+  EXPECT_EQ(RBlong->size(), 20U);
+  RBlong->Delete();
+}
+
+template <class T> void TestRB() {
+  RingBuffer<T> *RB;
+  const size_t Sizes[] = {1, 2, 3, 5, 8, 16, 20, 40, 10000};
+  for (size_t Size : Sizes) {
+    RB = RingBuffer<T>::New(Size);
+    EXPECT_EQ(RB->size(), Size);
+    RB->Delete();
+  }
+
+  RB = RingBuffer<T>::New(4);
+  EXPECT_EQ(RB->size(), 4U);
+#define EXPECT_RING_BUFFER(a0, a1, a2, a3) \
+  EXPECT_EQ((int64_t)(*RB)[0], (int64_t)a0);                 \
+  EXPECT_EQ((int64_t)(*RB)[1], (int64_t)a1);                 \
+  EXPECT_EQ((int64_t)(*RB)[2], (int64_t)a2);                 \
+  EXPECT_EQ((int64_t)(*RB)[3], (int64_t)a3);
+
+  RB->push(T(1)); EXPECT_RING_BUFFER(1, 0, 0, 0);
+  RB->push(T(2)); EXPECT_RING_BUFFER(2, 1, 0, 0);
+  RB->push(T(3)); EXPECT_RING_BUFFER(3, 2, 1, 0);
+  RB->push(T(4)); EXPECT_RING_BUFFER(4, 3, 2, 1);
+  RB->push(T(5)); EXPECT_RING_BUFFER(5, 4, 3, 2);
+  RB->push(T(6)); EXPECT_RING_BUFFER(6, 5, 4, 3);
+  RB->push(T(7)); EXPECT_RING_BUFFER(7, 6, 5, 4);
+  RB->push(T(8)); EXPECT_RING_BUFFER(8, 7, 6, 5);
+  RB->push(T(9)); EXPECT_RING_BUFFER(9, 8, 7, 6);
+  RB->push(T(10)); EXPECT_RING_BUFFER(10, 9, 8, 7);
+  RB->push(T(11)); EXPECT_RING_BUFFER(11, 10, 9, 8);
+  RB->push(T(12)); EXPECT_RING_BUFFER(12, 11, 10, 9);
+
+#undef EXPECT_RING_BUFFER
+}
+
+#if SANITIZER_WORDSIZE == 64
+TEST(RingBuffer, int64) {
+  TestRB<int64_t>();
+}
+
+TEST(RingBuffer, LargeStruct) {
+  TestRB<LargeStruct>();
+}
+
+template<typename T>
+CompactRingBuffer<T> *AllocCompactRingBuffer(size_t count) {
+  size_t sz = sizeof(T) * count;
+  EXPECT_EQ(0ULL, sz % 4096);
+  void *p = MmapAlignedOrDieOnFatalError(sz, sz * 2, "CompactRingBuffer");
+  return new CompactRingBuffer<T>(p, sz);
+}
+
+TEST(CompactRingBuffer, int64) {
+  const size_t page_sizes[] = {1, 2, 4, 128};
+
+  for (size_t pages : page_sizes) {
+    size_t count = 4096 * pages / sizeof(int64_t);
+    auto R = AllocCompactRingBuffer<int64_t>(count);
+    int64_t top = count * 3 + 13;
+    for (int64_t i = 0; i < top; ++i) R->push(i);
+    for (int64_t i = 0; i < (int64_t)count; ++i)
+      EXPECT_EQ(top - i - 1, (*R)[i]);
+  }
+}
+#endif
+}  // namespace __sanitizer
diff --git a/lib/scudo/CMakeLists.txt b/lib/scudo/CMakeLists.txt
index 0646c3d..79f69e9 100644
--- a/lib/scudo/CMakeLists.txt
+++ b/lib/scudo/CMakeLists.txt
@@ -17,6 +17,14 @@
 # Use gc-sections by default to avoid unused code being pulled in.
 list(APPEND SCUDO_DYNAMIC_LINK_FLAGS -Wl,--gc-sections)
 
+if(ANDROID)
+# Put most Sanitizer shared libraries in the global group. For more details, see
+# android-changes-for-ndk-developers.md#changes-to-library-search-order
+  if (COMPILER_RT_HAS_Z_GLOBAL)
+    list(APPEND SCUDO_DYNAMIC_LINK_FLAGS -Wl,-z,global)
+  endif()
+endif()
+
 # The minimal Scudo runtime does not inlude the UBSan runtime.
 set(SCUDO_MINIMAL_OBJECT_LIBS
   RTSanitizerCommonNoTermination
diff --git a/lib/scudo/scudo_allocator.cpp b/lib/scudo/scudo_allocator.cpp
index 4a11bf5..fb04fb2 100644
--- a/lib/scudo/scudo_allocator.cpp
+++ b/lib/scudo/scudo_allocator.cpp
@@ -129,16 +129,9 @@
             computeChecksum(Ptr, &NewUnpackedHeader));
   }
 
-  // Nulls out a chunk header. When returning the chunk to the backend, there
-  // is no need to store a valid ChunkAvailable header, as this would be
-  // computationally expensive. Zeroing out serves the same purpose by making
-  // the header invalid. In the extremely rare event where 0 would be a valid
-  // checksum for the chunk, the state of the chunk is ChunkAvailable anyway.
+  // Ensure that ChunkAvailable is 0, so that if a 0 checksum is ever valid
+  // for a fully nulled out header, its state will be available anyway.
   COMPILER_CHECK(ChunkAvailable == 0);
-  static INLINE void eraseHeader(void *Ptr) {
-    const PackedHeader NullPackedHeader = 0;
-    atomic_store_relaxed(getAtomicHeader(Ptr), NullPackedHeader);
-  }
 
   // Loads and unpacks the header, verifying the checksum in the process.
   static INLINE
@@ -185,7 +178,9 @@
     Chunk::loadHeader(Ptr, &Header);
     if (UNLIKELY(Header.State != ChunkQuarantine))
       dieWithMessage("invalid chunk state when recycling address %p\n", Ptr);
-    Chunk::eraseHeader(Ptr);
+    UnpackedHeader NewHeader = Header;
+    NewHeader.State = ChunkAvailable;
+    Chunk::compareExchangeHeader(Ptr, &NewHeader, &Header);
     void *BackendPtr = Chunk::getBackendPtr(Ptr, &Header);
     if (Header.ClassId)
       getBackend().deallocatePrimary(Cache_, BackendPtr, Header.ClassId);
@@ -264,7 +259,8 @@
     Quarantine.Init(
         static_cast<uptr>(getFlags()->QuarantineSizeKb) << 10,
         static_cast<uptr>(getFlags()->ThreadLocalQuarantineSizeKb) << 10);
-    QuarantineChunksUpToSize = getFlags()->QuarantineChunksUpToSize;
+    QuarantineChunksUpToSize = (Quarantine.GetCacheSize() == 0) ? 0 :
+        getFlags()->QuarantineChunksUpToSize;
     DeallocationTypeMismatch = getFlags()->DeallocationTypeMismatch;
     DeleteSizeMismatch = getFlags()->DeleteSizeMismatch;
     ZeroContents = getFlags()->ZeroContents;
@@ -389,10 +385,11 @@
   // quarantine chunk size threshold.
   void quarantineOrDeallocateChunk(void *Ptr, UnpackedHeader *Header,
                                    uptr Size) {
-    const bool BypassQuarantine = (Quarantine.GetCacheSize() == 0) ||
-        (Size > QuarantineChunksUpToSize);
+    const bool BypassQuarantine = !Size || (Size > QuarantineChunksUpToSize);
     if (BypassQuarantine) {
-      Chunk::eraseHeader(Ptr);
+      UnpackedHeader NewHeader = *Header;
+      NewHeader.State = ChunkAvailable;
+      Chunk::compareExchangeHeader(Ptr, &NewHeader, Header);
       void *BackendPtr = Chunk::getBackendPtr(Ptr, Header);
       if (Header->ClassId) {
         bool UnlockRequired;
@@ -675,7 +672,7 @@
 }
 
 void *scudoPvalloc(uptr Size) {
-  uptr PageSize = GetPageSizeCached();
+  const uptr PageSize = GetPageSizeCached();
   if (UNLIKELY(CheckForPvallocOverflow(Size, PageSize))) {
     errno = ENOMEM;
     if (Instance.canReturnNull())
diff --git a/lib/tsan/CMakeLists.txt b/lib/tsan/CMakeLists.txt
index 4a2ea3f..d501d0c 100644
--- a/lib/tsan/CMakeLists.txt
+++ b/lib/tsan/CMakeLists.txt
@@ -158,6 +158,15 @@
         VERBATIM)
     elseif(arch STREQUAL "aarch64")
       add_asm_sources(TSAN_ASM_SOURCES rtl/tsan_rtl_aarch64.S)
+      # Sanity check for Go runtime.
+      set(BUILDGO_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/go/buildgo.sh)
+      add_custom_target(GotsanRuntimeCheck
+	COMMAND env "CC=${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}"
+		IN_TMPDIR=1 SILENT=1 ${BUILDGO_SCRIPT}
+	DEPENDS clang_rt.tsan-${arch} ${BUILDGO_SCRIPT}
+	WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/go
+	COMMENT "Checking TSan Go runtime..."
+	VERBATIM)
     elseif(arch MATCHES "powerpc64|powerpc64le")
       add_asm_sources(TSAN_ASM_SOURCES rtl/tsan_rtl_ppc64.S)
       # Sanity check for Go runtime.
diff --git a/lib/tsan/check_analyze.sh b/lib/tsan/check_analyze.sh
index 9b5abc3..b2beb85 100755
--- a/lib/tsan/check_analyze.sh
+++ b/lib/tsan/check_analyze.sh
@@ -34,18 +34,18 @@
   fi
 }
 
-for f in write1 write2 write4 write8; do
+for f in write1 write2 write4 write8 read2 read4 read8; do
+  check $f rsp 1
+  check $f push 1
+  check $f pop 6
+done
+
+for f in read1; do
   check $f rsp 1
   check $f push 2
   check $f pop 12
 done
 
-for f in read1 read2 read4 read8; do
-  check $f rsp 1
-  check $f push 3
-  check $f pop 18
-done
-
 for f in func_entry func_exit; do
   check $f rsp 0
   check $f push 0
diff --git a/lib/tsan/go/buildgo.sh b/lib/tsan/go/buildgo.sh
index 7f570ca..eec4cf1 100755
--- a/lib/tsan/go/buildgo.sh
+++ b/lib/tsan/go/buildgo.sh
@@ -55,13 +55,19 @@
 		"
 	if [ "`uname -a | grep ppc64le`" != "" ]; then
 		SUFFIX="linux_ppc64le"
+		ARCHCFLAGS="-m64"
 	elif [ "`uname -a | grep x86_64`" != "" ]; then
 		SUFFIX="linux_amd64"
+		ARCHCFLAGS="-m64"
 		OSCFLAGS="$OSCFLAGS -ffreestanding -Wno-unused-const-variable -Werror -Wno-unknown-warning-option"
+	elif [ "`uname -a | grep aarch64`" != "" ]; then
+		SUFFIX="linux_arm64"
+		ARCHCFLAGS=""
 	fi
 elif [ "`uname -a | grep FreeBSD`" != "" ]; then
 	SUFFIX="freebsd_amd64"
 	OSCFLAGS="-fno-strict-aliasing -fPIC -Werror"
+	ARCHCFLAGS="-m64"
 	OSLDFLAGS="-lpthread -fPIC -fpie"
 	SRCS="
 		$SRCS
@@ -77,6 +83,7 @@
 elif [ "`uname -a | grep NetBSD`" != "" ]; then
 	SUFFIX="netbsd_amd64"
 	OSCFLAGS="-fno-strict-aliasing -fPIC -Werror"
+	ARCHCFLAGS="-m64"
 	OSLDFLAGS="-lpthread -fPIC -fpie"
 	SRCS="
 		$SRCS
@@ -87,11 +94,13 @@
 		../../sanitizer_common/sanitizer_procmaps_common.cc
 		../../sanitizer_common/sanitizer_linux.cc
 		../../sanitizer_common/sanitizer_linux_libcdep.cc
+		../../sanitizer_common/sanitizer_netbsd.cc
 		../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
 	"
 elif [ "`uname -a | grep Darwin`" != "" ]; then
 	SUFFIX="darwin_amd64"
 	OSCFLAGS="-fPIC -Wno-unused-const-variable -Wno-unknown-warning-option -mmacosx-version-min=10.7"
+	ARCHCFLAGS="-m64"
 	OSLDFLAGS="-lpthread -fPIC -fpie -mmacosx-version-min=10.7"
 	SRCS="
 		$SRCS
@@ -104,6 +113,7 @@
 elif [ "`uname -a | grep MINGW`" != "" ]; then
 	SUFFIX="windows_amd64"
 	OSCFLAGS="-Wno-error=attributes -Wno-attributes -Wno-unused-const-variable -Wno-unknown-warning-option"
+	ARCHCFLAGS="-m64"
 	OSLDFLAGS=""
 	SRCS="
 		$SRCS
@@ -136,7 +146,7 @@
 	cat $F >> $DIR/gotsan.cc
 done
 
-FLAGS=" -I../rtl -I../.. -I../../sanitizer_common -I../../../include -std=c++11 -m64 -Wall -fno-exceptions -fno-rtti -DSANITIZER_GO=1 -DSANITIZER_DEADLOCK_DETECTOR_VERSION=2 $OSCFLAGS"
+FLAGS=" -I../rtl -I../.. -I../../sanitizer_common -I../../../include -std=c++11 -Wall -fno-exceptions -fno-rtti -DSANITIZER_GO=1 -DSANITIZER_DEADLOCK_DETECTOR_VERSION=2 $OSCFLAGS $ARCHCFLAGS"
 if [ "$DEBUG" = "" ]; then
 	FLAGS="$FLAGS -DSANITIZER_DEBUG=0 -O3 -fomit-frame-pointer"
 	if [ "$SUFFIX" = "linux_ppc64le" ]; then
@@ -153,7 +163,7 @@
 fi
 $CC $DIR/gotsan.cc -c -o $DIR/race_$SUFFIX.syso $FLAGS $CFLAGS
 
-$CC $OSCFLAGS test.c $DIR/race_$SUFFIX.syso -m64 -g -o $DIR/test $OSLDFLAGS $LDFLAGS
+$CC $OSCFLAGS $ARCHCFLAGS test.c $DIR/race_$SUFFIX.syso -g -o $DIR/test $OSLDFLAGS $LDFLAGS
 
 export GORACE="exitcode=0 atexit_sleep_ms=0"
 if [ "$SILENT" != "1" ]; then
diff --git a/lib/tsan/rtl/tsan_interceptors.cc b/lib/tsan/rtl/tsan_interceptors.cc
index 901997b..5e64d11 100644
--- a/lib/tsan/rtl/tsan_interceptors.cc
+++ b/lib/tsan/rtl/tsan_interceptors.cc
@@ -508,7 +508,8 @@
   uptr mangled_sp = env[6];
 #elif SANITIZER_MAC
 # ifdef __aarch64__
-    uptr mangled_sp = env[13];
+  uptr mangled_sp =
+      (GetMacosVersion() >= MACOS_VERSION_MOJAVE) ? env[12] : env[13];
 # else
     uptr mangled_sp = env[2];
 # endif
diff --git a/lib/tsan/rtl/tsan_libdispatch_mac.cc b/lib/tsan/rtl/tsan_libdispatch_mac.cc
index d6c1ca6..df22888 100644
--- a/lib/tsan/rtl/tsan_libdispatch_mac.cc
+++ b/lib/tsan/rtl/tsan_libdispatch_mac.cc
@@ -185,11 +185,8 @@
   TSAN_INTERCEPTOR(void, name, dispatch_queue_t q,                           \
                    DISPATCH_NOESCAPE dispatch_block_t block) {               \
     SCOPED_TSAN_INTERCEPTOR(name, q, block);                                 \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                           \
-    dispatch_block_t heap_block = Block_copy(block);                         \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();                             \
     tsan_block_context_t new_context = {                                     \
-        q, heap_block, &invoke_and_release_block, false, true, barrier, 0};  \
+        q, block, &invoke_block, false, true, barrier, 0};                   \
     Release(thr, pc, (uptr)&new_context);                                    \
     SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                           \
     REAL(name##_f)(q, &new_context, dispatch_callback_wrap);                 \
diff --git a/lib/tsan/rtl/tsan_platform.h b/lib/tsan/rtl/tsan_platform.h
index 6b3e6ba..70ae617 100644
--- a/lib/tsan/rtl/tsan_platform.h
+++ b/lib/tsan/rtl/tsan_platform.h
@@ -458,6 +458,32 @@
   static const uptr kAppMemEnd     = 0x00e000000000ull;
 };
 
+#elif SANITIZER_GO && defined(__aarch64__)
+
+/* Go on linux/aarch64 (48-bit VMA)
+0000 0000 1000 - 0000 1000 0000: executable
+0000 1000 0000 - 00c0 0000 0000: -
+00c0 0000 0000 - 00e0 0000 0000: heap
+00e0 0000 0000 - 2000 0000 0000: -
+2000 0000 0000 - 3000 0000 0000: shadow
+3000 0000 0000 - 3000 0000 0000: -
+3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
+4000 0000 0000 - 6000 0000 0000: -
+6000 0000 0000 - 6200 0000 0000: traces
+6200 0000 0000 - 8000 0000 0000: -
+*/
+
+struct Mapping48 {
+  static const uptr kMetaShadowBeg = 0x300000000000ull;
+  static const uptr kMetaShadowEnd = 0x400000000000ull;
+  static const uptr kTraceMemBeg   = 0x600000000000ull;
+  static const uptr kTraceMemEnd   = 0x620000000000ull;
+  static const uptr kShadowBeg     = 0x200000000000ull;
+  static const uptr kShadowEnd     = 0x300000000000ull;
+  static const uptr kAppMemBeg     = 0x000000001000ull;
+  static const uptr kAppMemEnd     = 0x00e000000000ull;
+};
+
 // Indicates the runtime will define the memory regions at runtime.
 #define TSAN_RUNTIME_VMA 1
 
@@ -525,8 +551,10 @@
 uptr MappingArchImpl(void) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return MappingImpl<Mapping39, Type>();
     case 42: return MappingImpl<Mapping42, Type>();
+#endif
     case 48: return MappingImpl<Mapping48, Type>();
   }
   DCHECK(0);
@@ -682,8 +710,10 @@
 bool IsAppMem(uptr mem) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return IsAppMemImpl<Mapping39>(mem);
     case 42: return IsAppMemImpl<Mapping42>(mem);
+#endif
     case 48: return IsAppMemImpl<Mapping48>(mem);
   }
   DCHECK(0);
@@ -713,8 +743,10 @@
 bool IsShadowMem(uptr mem) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return IsShadowMemImpl<Mapping39>(mem);
     case 42: return IsShadowMemImpl<Mapping42>(mem);
+#endif
     case 48: return IsShadowMemImpl<Mapping48>(mem);
   }
   DCHECK(0);
@@ -744,8 +776,10 @@
 bool IsMetaMem(uptr mem) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return IsMetaMemImpl<Mapping39>(mem);
     case 42: return IsMetaMemImpl<Mapping42>(mem);
+#endif
     case 48: return IsMetaMemImpl<Mapping48>(mem);
   }
   DCHECK(0);
@@ -785,8 +819,10 @@
 uptr MemToShadow(uptr x) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return MemToShadowImpl<Mapping39>(x);
     case 42: return MemToShadowImpl<Mapping42>(x);
+#endif
     case 48: return MemToShadowImpl<Mapping48>(x);
   }
   DCHECK(0);
@@ -828,8 +864,10 @@
 u32 *MemToMeta(uptr x) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return MemToMetaImpl<Mapping39>(x);
     case 42: return MemToMetaImpl<Mapping42>(x);
+#endif
     case 48: return MemToMetaImpl<Mapping48>(x);
   }
   DCHECK(0);
@@ -884,8 +922,10 @@
 uptr ShadowToMem(uptr s) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return ShadowToMemImpl<Mapping39>(s);
     case 42: return ShadowToMemImpl<Mapping42>(s);
+#endif
     case 48: return ShadowToMemImpl<Mapping48>(s);
   }
   DCHECK(0);
@@ -923,8 +963,10 @@
 uptr GetThreadTrace(int tid) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return GetThreadTraceImpl<Mapping39>(tid);
     case 42: return GetThreadTraceImpl<Mapping42>(tid);
+#endif
     case 48: return GetThreadTraceImpl<Mapping48>(tid);
   }
   DCHECK(0);
@@ -957,8 +999,10 @@
 uptr GetThreadTraceHeader(int tid) {
 #if defined(__aarch64__) && !defined(__APPLE__)
   switch (vmaSize) {
+#if !SANITIZER_GO
     case 39: return GetThreadTraceHeaderImpl<Mapping39>(tid);
     case 42: return GetThreadTraceHeaderImpl<Mapping42>(tid);
+#endif
     case 48: return GetThreadTraceHeaderImpl<Mapping48>(tid);
   }
   DCHECK(0);
diff --git a/lib/tsan/rtl/tsan_platform_linux.cc b/lib/tsan/rtl/tsan_platform_linux.cc
index de989b7..d2ce607 100644
--- a/lib/tsan/rtl/tsan_platform_linux.cc
+++ b/lib/tsan/rtl/tsan_platform_linux.cc
@@ -212,11 +212,19 @@
   vmaSize =
     (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1);
 #if defined(__aarch64__)
+# if !SANITIZER_GO
   if (vmaSize != 39 && vmaSize != 42 && vmaSize != 48) {
     Printf("FATAL: ThreadSanitizer: unsupported VMA range\n");
     Printf("FATAL: Found %zd - Supported 39, 42 and 48\n", vmaSize);
     Die();
   }
+#else
+  if (vmaSize != 48) {
+    Printf("FATAL: ThreadSanitizer: unsupported VMA range\n");
+    Printf("FATAL: Found %zd - Supported 48\n", vmaSize);
+    Die();
+  }
+#endif
 #elif defined(__powerpc64__)
 # if !SANITIZER_GO
   if (vmaSize != 44 && vmaSize != 46 && vmaSize != 47) {
diff --git a/lib/tsan/rtl/tsan_platform_mac.cc b/lib/tsan/rtl/tsan_platform_mac.cc
index f8d7324..7e3a473 100644
--- a/lib/tsan/rtl/tsan_platform_mac.cc
+++ b/lib/tsan/rtl/tsan_platform_mac.cc
@@ -240,6 +240,9 @@
 #endif
 }
 
+static const uptr kPthreadSetjmpXorKeySlot = 0x7;
+extern "C" uptr __tsan_darwin_setjmp_xor_key = 0;
+
 void InitializePlatform() {
   DisableCoreDumperIfNecessary();
 #if !SANITIZER_GO
@@ -251,6 +254,11 @@
   prev_pthread_introspection_hook =
       pthread_introspection_hook_install(&my_pthread_introspection_hook);
 #endif
+
+  if (GetMacosVersion() >= MACOS_VERSION_MOJAVE) {
+    __tsan_darwin_setjmp_xor_key =
+        (uptr)pthread_getspecific(kPthreadSetjmpXorKeySlot);
+  }
 }
 
 #if !SANITIZER_GO
diff --git a/lib/tsan/rtl/tsan_rtl_aarch64.S b/lib/tsan/rtl/tsan_rtl_aarch64.S
index 844c2e2..3d02bf2 100644
--- a/lib/tsan/rtl/tsan_rtl_aarch64.S
+++ b/lib/tsan/rtl/tsan_rtl_aarch64.S
@@ -120,8 +120,10 @@
   add     x0, x29, 32
   eor     x1, x2, x0
 #else
+  adrp    x2, ___tsan_darwin_setjmp_xor_key@page
+  ldr     x2, [x2, ___tsan_darwin_setjmp_xor_key@pageoff]
   add     x0, x29, 32
-  mov     x1, x0
+  eor     x1, x2, x0
 #endif
 
   // call tsan interceptor
@@ -178,8 +180,10 @@
   add     x0, x29, 32
   eor     x1, x2, x0
 #else
+  adrp    x2, ___tsan_darwin_setjmp_xor_key@page
+  ldr     x2, [x2, ___tsan_darwin_setjmp_xor_key@pageoff]
   add     x0, x29, 32
-  mov     x1, x0
+  eor     x1, x2, x0
 #endif
 
   // call tsan interceptor
@@ -238,8 +242,10 @@
   add     x0, x29, 32
   eor     x1, x2, x0
 #else
+  adrp    x2, ___tsan_darwin_setjmp_xor_key@page
+  ldr     x2, [x2, ___tsan_darwin_setjmp_xor_key@pageoff]
   add     x0, x29, 32
-  mov     x1, x0
+  eor     x1, x2, x0
 #endif
 
   // call tsan interceptor
diff --git a/lib/tsan/rtl/tsan_rtl_amd64.S b/lib/tsan/rtl/tsan_rtl_amd64.S
index 8af61bf..34ef51c 100644
--- a/lib/tsan/rtl/tsan_rtl_amd64.S
+++ b/lib/tsan/rtl/tsan_rtl_amd64.S
@@ -196,6 +196,7 @@
 #elif defined(__APPLE__)
   lea 16(%rsp), %rdi
   mov %rdi, %rsi
+  xorq ___tsan_darwin_setjmp_xor_key(%rip), %rsi
 #elif defined(__linux__)
   lea 16(%rsp), %rdi
   mov %rdi, %rsi
@@ -244,6 +245,7 @@
 #elif defined(__APPLE__)
   lea 16(%rsp), %rdi
   mov %rdi, %rsi
+  xorq ___tsan_darwin_setjmp_xor_key(%rip), %rsi
 #elif defined(__linux__)
   lea 16(%rsp), %rdi
   mov %rdi, %rsi
@@ -299,6 +301,7 @@
 #elif defined(__APPLE__)
   lea 32(%rsp), %rdi
   mov %rdi, %rsi
+  xorq ___tsan_darwin_setjmp_xor_key(%rip), %rsi
 #elif defined(__linux__)
   lea 32(%rsp), %rdi
   mov %rdi, %rsi
diff --git a/lib/xray/CMakeLists.txt b/lib/xray/CMakeLists.txt
index 8e18f55..4bb25d4 100644
--- a/lib/xray/CMakeLists.txt
+++ b/lib/xray/CMakeLists.txt
@@ -147,7 +147,7 @@
   add_compiler_rt_object_libraries(RTXray
     OS ${XRAY_SUPPORTED_OS}
     ARCHS ${XRAY_SUPPORTED_ARCH}
-    SOURCES ${x86_64_SOURCES}
+    SOURCES ${XRAY_SOURCES} ${x86_64_SOURCES}
     ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS}
     CFLAGS ${XRAY_CFLAGS}
     DEFS ${XRAY_COMMON_DEFINITIONS}
diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt
index 11f3731..2f167e3 100644
--- a/lib/xray/tests/CMakeLists.txt
+++ b/lib/xray/tests/CMakeLists.txt
@@ -19,9 +19,16 @@
   ${XRAY_CFLAGS}
   ${COMPILER_RT_UNITTEST_CFLAGS}
   ${COMPILER_RT_GTEST_CFLAGS}
+  ${COMPILER_RT_GMOCK_CFLAGS}
   -I${COMPILER_RT_SOURCE_DIR}/include
   -I${COMPILER_RT_SOURCE_DIR}/lib/xray
-  -I${COMPILER_RT_SOURCE_DIR}/lib)
+  -I${COMPILER_RT_SOURCE_DIR}/lib
+  )
+
+# We add the include directories one at a time in our CFLAGS.
+foreach (DIR ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})
+  list(APPEND XRAY_UNITTEST_CFLAGS -I${DIR})
+endforeach()
 
 function(add_xray_lib library)
   add_library(${library} STATIC ${ARGN})
@@ -42,10 +49,27 @@
 set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
 set(XRAY_UNITTEST_LINK_FLAGS
   ${CMAKE_THREAD_LIBS_INIT}
-	-l${SANITIZER_CXX_ABI_LIBRARY}
-  -fxray-instrument
-  )
+  -l${SANITIZER_CXX_ABI_LIBRARY})
+
 if (NOT APPLE)
+  # Needed by LLVMSupport.
+  append_list_if(
+    COMPILER_RT_HAS_TERMINFO
+    -l${COMPILER_RT_TERMINFO_LIB} XRAY_UNITTEST_LINK_FLAGS)
+
+  if (COMPILER_RT_STANDALONE_BUILD)
+    append_list_if(COMPILER_RT_HAS_LLVMXRAY ${LLVM_XRAY_LDFLAGS} XRAY_UNITTEST_LINK_FLAGS)
+    append_list_if(COMPILER_RT_HAS_LLVMXRAY ${LLVM_XRAY_LIBLIST} XRAY_UNITTEST_LINK_FLAGS)
+  else()
+    # We add the library directories one at a time in our CFLAGS.
+    foreach (DIR ${LLVM_LIBRARY_DIR})
+      list(APPEND XRAY_UNITTEST_LINK_FLAGS -L${DIR})
+    endforeach()
+
+    # We also add the actual libraries to link as dependencies.
+    list(APPEND XRAY_UNITTEST_LINK_FLAGS -lLLVMXRay -lLLVMSupport -lLLVMTestingSupport)
+  endif()
+
   append_list_if(COMPILER_RT_HAS_LIBM -lm XRAY_UNITTEST_LINK_FLAGS)
   append_list_if(COMPILER_RT_HAS_LIBRT -lrt XRAY_UNITTEST_LINK_FLAGS)
   append_list_if(COMPILER_RT_HAS_LIBDL -ldl XRAY_UNITTEST_LINK_FLAGS)
@@ -62,17 +86,21 @@
       generate_compiler_rt_tests(TEST_OBJECTS
         XRayUnitTests "${testname}-${arch}-Test" "${arch}"
         SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
+                ${COMPILER_RT_GMOCK_SOURCE}
+
         # Note that any change in the implementations will cause all the unit
         # tests to be re-built. This is by design, but may be cumbersome during
         # the build/test cycle.
         COMPILE_DEPS ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}
         ${XRAY_HEADERS} ${XRAY_ALL_SOURCE_FILES_ABS_PATHS}
         RUNTIME "${XRAY_RUNTIME_LIBS}"
-        DEPS gtest xray llvm-xray
+        DEPS gtest xray llvm-xray LLVMXRay LLVMTestingSupport
         CFLAGS ${XRAY_UNITTEST_CFLAGS}
-        LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS})
+        LINK_FLAGS ${TARGET_LINK_FLAGS} ${XRAY_UNITTEST_LINK_FLAGS}
+        )
       set_target_properties(XRayUnitTests
-        PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+        PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endforeach()
   endif()
 endmacro()
diff --git a/lib/xray/tests/unit/CMakeLists.txt b/lib/xray/tests/unit/CMakeLists.txt
index b42eb50..d0ead94 100644
--- a/lib/xray/tests/unit/CMakeLists.txt
+++ b/lib/xray/tests/unit/CMakeLists.txt
@@ -1,12 +1,8 @@
-add_xray_unittest(XRayBufferQueueTest SOURCES
-  buffer_queue_test.cc xray_unit_test_main.cc)
-add_xray_unittest(XRayFDRLoggingTest SOURCES
-  fdr_logging_test.cc xray_unit_test_main.cc)
-add_xray_unittest(XRayAllocatorTest SOURCES
-  allocator_test.cc xray_unit_test_main.cc)
-add_xray_unittest(XRaySegmentedArrayTest SOURCES
-  segmented_array_test.cc xray_unit_test_main.cc)
-add_xray_unittest(XRayFunctionCallTrieTest SOURCES
-  function_call_trie_test.cc xray_unit_test_main.cc)
-add_xray_unittest(XRayProfileCollectorTest SOURCES
-  profile_collector_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayTest SOURCES
+  buffer_queue_test.cc
+  allocator_test.cc
+  segmented_array_test.cc
+  function_call_trie_test.cc
+  profile_collector_test.cc
+  fdr_log_writer_test.cc
+  xray_unit_test_main.cc)
diff --git a/lib/xray/tests/unit/fdr_log_writer_test.cc b/lib/xray/tests/unit/fdr_log_writer_test.cc
new file mode 100644
index 0000000..3a2138c
--- /dev/null
+++ b/lib/xray/tests/unit/fdr_log_writer_test.cc
@@ -0,0 +1,92 @@
+//===-- fdr_log_writer_test.cc --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include <time.h>
+
+#include "xray/xray_records.h"
+#include "xray_fdr_log_writer.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/XRay/Trace.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace __xray {
+namespace {
+
+static constexpr size_t kSize = 4096;
+
+using ::llvm::HasValue;
+using ::testing::Eq;
+using ::testing::SizeIs;
+
+// Exercise the common code path where we initialize a buffer and are able to
+// write some records successfully.
+TEST(FdrLogWriterTest, WriteSomeRecords) {
+  bool Success = false;
+  BufferQueue Buffers(kSize, 1, Success);
+  BufferQueue::Buffer B;
+  ASSERT_EQ(Buffers.getBuffer(B), BufferQueue::ErrorCode::Ok);
+
+  FDRLogWriter Writer(B);
+  MetadataRecord Preamble[] = {
+      createMetadataRecord<MetadataRecord::RecordKinds::NewBuffer>(int32_t{1}),
+      createMetadataRecord<MetadataRecord::RecordKinds::WalltimeMarker>(
+          int64_t{1}, int32_t{2}),
+      createMetadataRecord<MetadataRecord::RecordKinds::Pid>(int32_t{1}),
+  };
+  ASSERT_THAT(Writer.writeMetadataRecords(Preamble),
+              Eq(sizeof(MetadataRecord) * 3));
+  ASSERT_TRUE(Writer.writeMetadata<MetadataRecord::RecordKinds::NewCPUId>(1));
+  ASSERT_TRUE(
+      Writer.writeFunction(FDRLogWriter::FunctionRecordKind::Enter, 1, 1));
+  ASSERT_TRUE(
+      Writer.writeFunction(FDRLogWriter::FunctionRecordKind::Exit, 1, 1));
+  ASSERT_EQ(Buffers.releaseBuffer(B), BufferQueue::ErrorCode::Ok);
+  ASSERT_EQ(B.Data, nullptr);
+  ASSERT_EQ(Buffers.finalize(), BufferQueue::ErrorCode::Ok);
+
+  // We then need to go through each element of the Buffers, and re-create a
+  // flat buffer that we would see if they were laid out in a file. This also
+  // means we need to write out the header manually.
+  // TODO: Isolate the file header writing.
+  std::string Serialized;
+  std::aligned_storage<sizeof(XRayFileHeader), alignof(XRayFileHeader)>::type
+      HeaderStorage;
+  auto *Header = reinterpret_cast<XRayFileHeader *>(&HeaderStorage);
+  new (Header) XRayFileHeader();
+  Header->Version = 3;
+  Header->Type = FileTypes::FDR_LOG;
+  Header->CycleFrequency = 3e9;
+  Header->ConstantTSC = 1;
+  Header->NonstopTSC = 1;
+  Serialized.append(reinterpret_cast<const char *>(&HeaderStorage),
+                    sizeof(XRayFileHeader));
+  size_t BufferCount = 0;
+  Buffers.apply([&](const BufferQueue::Buffer &B) {
+    ++BufferCount;
+    auto Size = atomic_load_relaxed(&B.Extents);
+    auto Extents =
+        createMetadataRecord<MetadataRecord::RecordKinds::BufferExtents>(Size);
+    Serialized.append(reinterpret_cast<const char *>(&Extents),
+                      sizeof(Extents));
+    Serialized.append(reinterpret_cast<const char *>(B.Data), Size);
+  });
+  ASSERT_EQ(BufferCount, 1u);
+
+  llvm::DataExtractor DE(Serialized, true, 8);
+  auto TraceOrErr = llvm::xray::loadTrace(DE);
+  EXPECT_THAT_EXPECTED(TraceOrErr, HasValue(SizeIs(2)));
+}
+
+} // namespace
+} // namespace __xray
diff --git a/lib/xray/tests/unit/fdr_logging_test.cc b/lib/xray/tests/unit/fdr_logging_test.cc
deleted file mode 100644
index b6961ef..0000000
--- a/lib/xray/tests/unit/fdr_logging_test.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-//===-- fdr_logging_test.cc -----------------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a function call tracing system.
-//
-//===----------------------------------------------------------------------===//
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_fdr_logging.h"
-#include "gtest/gtest.h"
-
-#include <array>
-#include <fcntl.h>
-#include <iostream>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/syscall.h>
-#include <sys/types.h>
-#include <system_error>
-#include <thread>
-#include <unistd.h>
-
-#include "xray/xray_records.h"
-
-namespace __xray {
-namespace {
-
-constexpr auto kBufferSize = 16384;
-constexpr auto kBufferMax = 10;
-
-struct ScopedFileCloserAndDeleter {
-  explicit ScopedFileCloserAndDeleter(int Fd, const char *Filename)
-      : Fd(Fd), Filename(Filename) {}
-
-  ~ScopedFileCloserAndDeleter() {
-    if (Map)
-      munmap(Map, Size);
-    if (Fd) {
-      close(Fd);
-      unlink(Filename);
-    }
-  }
-
-  void registerMap(void *M, size_t S) {
-    Map = M;
-    Size = S;
-  }
-
-  int Fd;
-  const char *Filename;
-  void *Map = nullptr;
-  size_t Size = 0;
-};
-
-TEST(FDRLoggingTest, Simple) {
-  FDRLoggingOptions Options;
-  Options.ReportErrors = true;
-  char TmpFilename[] = "fdr-logging-test.XXXXXX";
-  Options.Fd = mkstemp(TmpFilename);
-  ASSERT_NE(Options.Fd, -1);
-  ASSERT_EQ(fdrLoggingInit(kBufferSize, kBufferMax, &Options,
-                           sizeof(FDRLoggingOptions)),
-            XRayLogInitStatus::XRAY_LOG_INITIALIZED);
-  fdrLoggingHandleArg0(1, XRayEntryType::ENTRY);
-  fdrLoggingHandleArg0(1, XRayEntryType::EXIT);
-  ASSERT_EQ(fdrLoggingFinalize(), XRayLogInitStatus::XRAY_LOG_FINALIZED);
-  ASSERT_EQ(fdrLoggingFlush(), XRayLogFlushStatus::XRAY_LOG_FLUSHED);
-
-  // To do this properly, we have to close the file descriptor then re-open the
-  // file for reading this time.
-  ASSERT_EQ(close(Options.Fd), 0);
-  int Fd = open(TmpFilename, O_RDONLY);
-  ASSERT_NE(-1, Fd);
-  ScopedFileCloserAndDeleter Guard(Fd, TmpFilename);
-  auto Size = lseek(Fd, 0, SEEK_END);
-  ASSERT_NE(Size, 0);
-  // Map the file contents.
-  void *Map = mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0);
-  const char *Contents = static_cast<const char *>(Map);
-  Guard.registerMap(Map, Size);
-  ASSERT_NE(Contents, nullptr);
-
-  XRayFileHeader H;
-  memcpy(&H, Contents, sizeof(XRayFileHeader));
-  ASSERT_EQ(H.Version, 3);
-  ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
-
-  // We require one buffer at least to have the "extents" metadata record,
-  // followed by the NewBuffer record.
-  MetadataRecord MDR0, MDR1;
-  memcpy(&MDR0, Contents + sizeof(XRayFileHeader), sizeof(MetadataRecord));
-  memcpy(&MDR1, Contents + sizeof(XRayFileHeader) + sizeof(MetadataRecord),
-         sizeof(MetadataRecord));
-  ASSERT_EQ(MDR0.RecordKind,
-            uint8_t(MetadataRecord::RecordKinds::BufferExtents));
-  ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer));
-}
-
-TEST(FDRLoggingTest, Multiple) {
-  FDRLoggingOptions Options;
-  char TmpFilename[] = "fdr-logging-test.XXXXXX";
-  Options.Fd = mkstemp(TmpFilename);
-  ASSERT_NE(Options.Fd, -1);
-  ASSERT_EQ(fdrLoggingInit(kBufferSize, kBufferMax, &Options,
-                           sizeof(FDRLoggingOptions)),
-            XRayLogInitStatus::XRAY_LOG_INITIALIZED);
-  for (uint64_t I = 0; I < 100; ++I) {
-    fdrLoggingHandleArg0(1, XRayEntryType::ENTRY);
-    fdrLoggingHandleArg0(1, XRayEntryType::EXIT);
-  }
-  ASSERT_EQ(fdrLoggingFinalize(), XRayLogInitStatus::XRAY_LOG_FINALIZED);
-  ASSERT_EQ(fdrLoggingFlush(), XRayLogFlushStatus::XRAY_LOG_FLUSHED);
-
-  // To do this properly, we have to close the file descriptor then re-open the
-  // file for reading this time.
-  ASSERT_EQ(close(Options.Fd), 0);
-  int Fd = open(TmpFilename, O_RDONLY);
-  ASSERT_NE(-1, Fd);
-  ScopedFileCloserAndDeleter Guard(Fd, TmpFilename);
-  auto Size = lseek(Fd, 0, SEEK_END);
-  ASSERT_NE(Size, 0);
-  // Map the file contents.
-  void *Map = mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0);
-  const char *Contents = static_cast<const char *>(Map);
-  Guard.registerMap(Map, Size);
-  ASSERT_NE(Contents, nullptr);
-
-  XRayFileHeader H;
-  memcpy(&H, Contents, sizeof(XRayFileHeader));
-  ASSERT_EQ(H.Version, 3);
-  ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
-
-  MetadataRecord MDR0, MDR1;
-  memcpy(&MDR0, Contents + sizeof(XRayFileHeader), sizeof(MetadataRecord));
-  memcpy(&MDR1, Contents + sizeof(XRayFileHeader) + sizeof(MetadataRecord),
-         sizeof(MetadataRecord));
-  ASSERT_EQ(MDR0.RecordKind,
-            uint8_t(MetadataRecord::RecordKinds::BufferExtents));
-  ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer));
-}
-
-TEST(FDRLoggingTest, MultiThreadedCycling) {
-  FDRLoggingOptions Options;
-  char TmpFilename[] = "fdr-logging-test.XXXXXX";
-  Options.Fd = mkstemp(TmpFilename);
-  ASSERT_NE(Options.Fd, -1);
-  ASSERT_EQ(fdrLoggingInit(kBufferSize, 1, &Options, sizeof(FDRLoggingOptions)),
-            XRayLogInitStatus::XRAY_LOG_INITIALIZED);
-
-  // Now we want to create one thread, do some logging, then create another one,
-  // in succession and making sure that we're able to get thread records from
-  // the latest thread (effectively being able to recycle buffers).
-  std::array<tid_t, 2> Threads;
-  for (uint64_t I = 0; I < 2; ++I) {
-    std::thread t{[I, &Threads] {
-      fdrLoggingHandleArg0(I + 1, XRayEntryType::ENTRY);
-      fdrLoggingHandleArg0(I + 1, XRayEntryType::EXIT);
-      Threads[I] = GetTid();
-    }};
-    t.join();
-  }
-  ASSERT_EQ(fdrLoggingFinalize(), XRayLogInitStatus::XRAY_LOG_FINALIZED);
-  ASSERT_EQ(fdrLoggingFlush(), XRayLogFlushStatus::XRAY_LOG_FLUSHED);
-
-  // To do this properly, we have to close the file descriptor then re-open the
-  // file for reading this time.
-  ASSERT_EQ(close(Options.Fd), 0);
-  int Fd = open(TmpFilename, O_RDONLY);
-  ASSERT_NE(-1, Fd);
-  ScopedFileCloserAndDeleter Guard(Fd, TmpFilename);
-  auto Size = lseek(Fd, 0, SEEK_END);
-  ASSERT_NE(Size, 0);
-  // Map the file contents.
-  void *Map = mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0);
-  const char *Contents = static_cast<const char *>(Map);
-  Guard.registerMap(Map, Size);
-  ASSERT_NE(Contents, nullptr);
-
-  XRayFileHeader H;
-  memcpy(&H, Contents, sizeof(XRayFileHeader));
-  ASSERT_EQ(H.Version, 3);
-  ASSERT_EQ(H.Type, FileTypes::FDR_LOG);
-
-  MetadataRecord MDR0, MDR1;
-  memcpy(&MDR0, Contents + sizeof(XRayFileHeader), sizeof(MetadataRecord));
-  memcpy(&MDR1, Contents + sizeof(XRayFileHeader) + sizeof(MetadataRecord),
-         sizeof(MetadataRecord));
-  ASSERT_EQ(MDR0.RecordKind,
-            uint8_t(MetadataRecord::RecordKinds::BufferExtents));
-  ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer));
-  int32_t Latest = 0;
-  memcpy(&Latest, MDR1.Data, sizeof(int32_t));
-  ASSERT_EQ(Latest, static_cast<int32_t>(Threads[1]));
-}
-
-} // namespace
-} // namespace __xray
diff --git a/lib/xray/xray_allocator.h b/lib/xray/xray_allocator.h
index 8244815..f77bccb 100644
--- a/lib/xray/xray_allocator.h
+++ b/lib/xray/xray_allocator.h
@@ -20,18 +20,70 @@
 #include "sanitizer_common/sanitizer_internal_defs.h"
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "sanitizer_common/sanitizer_posix.h"
+#include "xray_defs.h"
 #include "xray_utils.h"
-#include <sys/mman.h>
 #include <cstddef>
 #include <cstdint>
-
-#ifndef MAP_NORESERVE
-// no-op on NetBSD (at least), unsupported flag on FreeBSD basically because unneeded
-#define MAP_NORESERVE 0
-#endif
+#include <sys/mman.h>
 
 namespace __xray {
 
+// We implement our own memory allocation routine which will bypass the
+// internal allocator. This allows us to manage the memory directly, using
+// mmap'ed memory to back the allocators.
+template <class T> T *allocate() XRAY_NEVER_INSTRUMENT {
+  uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached());
+  uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  int ErrNo;
+  if (UNLIKELY(internal_iserror(B, &ErrNo))) {
+    if (Verbosity())
+      Report(
+          "XRay Profiling: Failed to allocate memory of size %d; Error = %d.\n",
+          RoundedSize, B);
+    return nullptr;
+  }
+  return reinterpret_cast<T *>(B);
+}
+
+template <class T> void deallocate(T *B) XRAY_NEVER_INSTRUMENT {
+  if (B == nullptr)
+    return;
+  uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached());
+  internal_munmap(B, RoundedSize);
+}
+
+template <class T = uint8_t> T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT {
+  uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached());
+  uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  int ErrNo;
+  if (UNLIKELY(internal_iserror(B, &ErrNo))) {
+    if (Verbosity())
+      Report(
+          "XRay Profiling: Failed to allocate memory of size %d; Error = %d.\n",
+          RoundedSize, B);
+    return nullptr;
+  }
+  return reinterpret_cast<T *>(B);
+}
+
+template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT {
+  if (B == nullptr)
+    return;
+  uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached());
+  internal_munmap(B, RoundedSize);
+}
+
+template <class T, class... U>
+T *initArray(size_t N, U &&... Us) XRAY_NEVER_INSTRUMENT {
+  auto A = allocateBuffer<T>(N);
+  if (A != nullptr)
+    while (N > 0)
+      new (A + (--N)) T(std::forward<U>(Us)...);
+  return A;
+}
+
 /// The Allocator type hands out fixed-sized chunks of memory that are
 /// cache-line aligned and sized. This is useful for placement of
 /// performance-sensitive data in memory that's frequently accessed. The
@@ -59,19 +111,16 @@
 
 private:
   const size_t MaxMemory{0};
-  void *BackingStore = nullptr;
-  void *AlignedNextBlock = nullptr;
+  uint8_t *BackingStore = nullptr;
+  uint8_t *AlignedNextBlock = nullptr;
   size_t AllocatedBlocks = 0;
   SpinMutex Mutex{};
 
-  void *Alloc() {
+  void *Alloc() XRAY_NEVER_INSTRUMENT {
     SpinMutexLock Lock(&Mutex);
     if (UNLIKELY(BackingStore == nullptr)) {
-      BackingStore = reinterpret_cast<void *>(
-          internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE,
-                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0));
-      if (BackingStore == MAP_FAILED) {
-        BackingStore = nullptr;
+      BackingStore = allocateBuffer(MaxMemory);
+      if (BackingStore == nullptr) {
         if (Verbosity())
           Report("XRay Profiling: Failed to allocate memory for allocator.\n");
         return nullptr;
@@ -84,7 +133,7 @@
       auto AlignedNextBlockNum = nearest_boundary(
           reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize);
       if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) {
-        munmap(BackingStore, MaxMemory);
+        deallocateBuffer(BackingStore, MaxMemory);
         AlignedNextBlock = BackingStore = nullptr;
         if (Verbosity())
           Report("XRay Profiling: Cannot obtain enough memory from "
@@ -92,7 +141,7 @@
         return nullptr;
       }
 
-      AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum);
+      AlignedNextBlock = reinterpret_cast<uint8_t *>(AlignedNextBlockNum);
 
       // Assert that AlignedNextBlock is cache-line aligned.
       DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize,
@@ -105,21 +154,21 @@
     // Align the pointer we'd like to return to an appropriate alignment, then
     // advance the pointer from where to start allocations.
     void *Result = AlignedNextBlock;
-    AlignedNextBlock = reinterpret_cast<void *>(
-        reinterpret_cast<char *>(AlignedNextBlock) + N);
+    AlignedNextBlock = reinterpret_cast<uint8_t *>(
+        reinterpret_cast<uint8_t *>(AlignedNextBlock) + N);
     ++AllocatedBlocks;
     return Result;
   }
 
 public:
-  explicit Allocator(size_t M)
+  explicit Allocator(size_t M) XRAY_NEVER_INSTRUMENT
       : MaxMemory(nearest_boundary(M, kCacheLineSize)) {}
 
-  Block Allocate() { return {Alloc()}; }
+  Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; }
 
-  ~Allocator() NOEXCEPT {
+  ~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT {
     if (BackingStore != nullptr) {
-      internal_munmap(BackingStore, MaxMemory);
+      deallocateBuffer(BackingStore, MaxMemory);
     }
   }
 };
diff --git a/lib/xray/xray_basic_logging.cc b/lib/xray/xray_basic_logging.cc
index 585ca64..ee28d59 100644
--- a/lib/xray/xray_basic_logging.cc
+++ b/lib/xray/xray_basic_logging.cc
@@ -38,8 +38,9 @@
 
 namespace __xray {
 
-SpinMutex LogMutex;
+static SpinMutex LogMutex;
 
+namespace {
 // We use elements of this type to record the entry TSC of every function ID we
 // see as we're tracing a particular thread's execution.
 struct alignas(16) StackEntry {
@@ -62,11 +63,18 @@
   int Fd = -1;
 };
 
+struct BasicLoggingOptions {
+  int DurationFilterMicros = 0;
+  size_t MaxStackDepth = 0;
+  size_t ThreadBufferSize = 0;
+};
+} // namespace
+
 static pthread_key_t PThreadKey;
 
 static atomic_uint8_t BasicInitialized{0};
 
-BasicLoggingOptions GlobalOptions;
+struct BasicLoggingOptions GlobalOptions;
 
 thread_local atomic_uint8_t Guard{0};
 
@@ -360,8 +368,8 @@
   fsync(TLD.Fd);
 }
 
-XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax,
-                                   void *Options,
+XRayLogInitStatus basicLoggingInit(UNUSED size_t BufferSize,
+                                   UNUSED size_t BufferMax, void *Options,
                                    size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
   uint8_t Expected = 0;
   if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1,
@@ -385,43 +393,32 @@
              "using emulation instead.\n");
   });
 
-  if (BufferSize == 0 && BufferMax == 0 && Options != nullptr) {
-    FlagParser P;
-    BasicFlags F;
-    F.setDefaults();
-    registerXRayBasicFlags(&P, &F);
-    P.ParseString(useCompilerDefinedBasicFlags());
-    auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
-    if (EnvOpts == nullptr)
-      EnvOpts = "";
+  FlagParser P;
+  BasicFlags F;
+  F.setDefaults();
+  registerXRayBasicFlags(&P, &F);
+  P.ParseString(useCompilerDefinedBasicFlags());
+  auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
+  if (EnvOpts == nullptr)
+    EnvOpts = "";
 
-    P.ParseString(EnvOpts);
+  P.ParseString(EnvOpts);
 
-    // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options
-    // set through XRAY_OPTIONS instead.
-    if (internal_strlen(EnvOpts) == 0) {
-      F.func_duration_threshold_us =
-          flags()->xray_naive_log_func_duration_threshold_us;
-      F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
-      F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
-    }
-
-    P.ParseString(static_cast<const char *>(Options));
-    GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
-    GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
-    GlobalOptions.MaxStackDepth = F.max_stack_depth;
-    *basicFlags() = F;
-  } else if (OptionsSize != sizeof(BasicLoggingOptions)) {
-    Report("Invalid options size, potential ABI mismatch; expected %d got %d",
-           sizeof(BasicLoggingOptions), OptionsSize);
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  } else {
-    if (Verbosity())
-      Report("XRay Basic: struct-based init is deprecated, please use "
-             "string-based configuration instead.\n");
-    GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options);
+  // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options
+  // set through XRAY_OPTIONS instead.
+  if (internal_strlen(EnvOpts) == 0) {
+    F.func_duration_threshold_us =
+        flags()->xray_naive_log_func_duration_threshold_us;
+    F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
+    F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
   }
 
+  P.ParseString(static_cast<const char *>(Options));
+  GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
+  GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
+  GlobalOptions.MaxStackDepth = F.max_stack_depth;
+  *basicFlags() = F;
+
   atomic_store(&ThresholdTicks,
                atomic_load(&TicksPerSec, memory_order_acquire) *
                    GlobalOptions.DurationFilterMicros / 1000000,
diff --git a/lib/xray/xray_buffer_queue.cc b/lib/xray/xray_buffer_queue.cc
index 3ce7289..5a88ecd 100644
--- a/lib/xray/xray_buffer_queue.cc
+++ b/lib/xray/xray_buffer_queue.cc
@@ -16,87 +16,42 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_posix.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
 #include <memory>
 #include <sys/mman.h>
 
-#ifndef MAP_NORESERVE
-// no-op on NetBSD (at least), unsupported flag on FreeBSD
-#define MAP_NORESERVE 0
-#endif
-
 using namespace __xray;
 using namespace __sanitizer;
 
-template <class T> static T *allocRaw(size_t N) {
-  // TODO: Report errors?
-  // We use MAP_NORESERVE on platforms where it's supported to ensure that the
-  // pages we're allocating for XRay never end up in pages that can be swapped
-  // in/out. We're doing this because for FDR mode, we want to ensure that
-  // writes to the buffers stay resident in memory to prevent XRay itself from
-  // causing swapping/thrashing.
-  //
-  // In the case when XRay pages cannot be swapped in/out or there's not enough
-  // RAM to back these pages, we're willing to cause a segmentation fault
-  // instead of introducing latency in the measurement. We assume here that
-  // there are enough pages that are swappable in/out outside of the buffers
-  // being used by FDR mode (which are bounded and configurable anyway) to allow
-  // us to keep using always-resident memory.
-  //
-  // TODO: Make this configurable?
-  void *A = reinterpret_cast<void *>(
-      internal_mmap(NULL, N * sizeof(T), PROT_WRITE | PROT_READ,
-                    MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0));
-  return (A == MAP_FAILED) ? nullptr : reinterpret_cast<T *>(A);
-}
-
-template <class T> static void deallocRaw(T *ptr, size_t N) {
-  // TODO: Report errors?
-  if (ptr != nullptr)
-    internal_munmap(ptr, N);
-}
-
-template <class T> static T *initArray(size_t N) {
-  auto A = allocRaw<T>(N);
-  if (A != nullptr)
-    while (N > 0)
-      new (A + (--N)) T();
-  return A;
-}
-
-BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
-    : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)),
-      BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)),
-      Next(Buffers), First(Buffers), LiveBuffers(0) {
-  if (Buffers == nullptr) {
+BufferQueue::BufferQueue(size_t B, size_t N,
+                         bool &Success) XRAY_NEVER_INSTRUMENT
+    : BufferSize(B),
+      BufferCount(N),
+      Mutex(),
+      Finalizing{0},
+      BackingStore(allocateBuffer(B *N)),
+      Buffers(initArray<BufferQueue::BufferRep>(N)),
+      Next(Buffers),
+      First(Buffers),
+      LiveBuffers(0) {
+  if (BackingStore == nullptr) {
     Success = false;
     return;
   }
-  if (OwnedBuffers == nullptr) {
-    // Clean up the buffers we've already allocated.
-    for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
-      B->~BufferRep();
-    deallocRaw(Buffers, N);
+  if (Buffers == nullptr) {
+    deallocateBuffer(BackingStore, BufferSize * BufferCount);
     Success = false;
     return;
-  };
+  }
 
   for (size_t i = 0; i < N; ++i) {
     auto &T = Buffers[i];
-    void *Tmp = allocRaw<char>(BufferSize);
-    if (Tmp == nullptr) {
-      Success = false;
-      return;
-    }
-    auto *Extents = allocRaw<BufferExtents>(1);
-    if (Extents == nullptr) {
-      Success = false;
-      return;
-    }
     auto &Buf = T.Buff;
-    Buf.Data = Tmp;
+    Buf.Data = reinterpret_cast<char *>(BackingStore) + (BufferSize * i);
     Buf.Size = B;
-    Buf.Extents = Extents;
-    OwnedBuffers[i] = Tmp;
+    atomic_store(&Buf.Extents, 0, memory_order_release);
+    T.Used = false;
   }
   Success = true;
 }
@@ -104,13 +59,17 @@
 BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
   if (atomic_load(&Finalizing, memory_order_acquire))
     return ErrorCode::QueueFinalizing;
+
   SpinMutexLock Guard(&Mutex);
   if (LiveBuffers == BufferCount)
     return ErrorCode::NotEnoughMemory;
 
   auto &T = *Next;
   auto &B = T.Buff;
-  Buf = B;
+  auto Extents = atomic_load(&B.Extents, memory_order_acquire);
+  atomic_store(&Buf.Extents, Extents, memory_order_release);
+  Buf.Data = B.Data;
+  Buf.Size = B.Size;
   T.Used = true;
   ++LiveBuffers;
 
@@ -121,15 +80,11 @@
 }
 
 BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
-  // Blitz through the buffers array to find the buffer.
-  bool Found = false;
-  for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) {
-    if (*I == Buf.Data) {
-      Found = true;
-      break;
-    }
-  }
-  if (!Found)
+  // Check whether the buffer being referred to is within the bounds of the
+  // backing store's range.
+  if (Buf.Data < BackingStore ||
+      Buf.Data >
+          reinterpret_cast<char *>(BackingStore) + (BufferCount * BufferSize))
     return ErrorCode::UnrecognizedBuffer;
 
   SpinMutexLock Guard(&Mutex);
@@ -140,10 +95,14 @@
     return ErrorCode::NotEnoughMemory;
 
   // Now that the buffer has been released, we mark it as "used".
-  First->Buff = Buf;
+  auto Extents = atomic_load(&Buf.Extents, memory_order_acquire);
+  atomic_store(&First->Buff.Extents, Extents, memory_order_release);
+  First->Buff.Data = Buf.Data;
+  First->Buff.Size = Buf.Size;
   First->Used = true;
   Buf.Data = nullptr;
   Buf.Size = 0;
+  atomic_store(&Buf.Extents, 0, memory_order_release);
   --LiveBuffers;
   if (++First == (Buffers + BufferCount))
     First = Buffers;
@@ -158,14 +117,8 @@
 }
 
 BufferQueue::~BufferQueue() {
-  for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
-    auto &T = *I;
-    auto &Buf = T.Buff;
-    deallocRaw(Buf.Data, Buf.Size);
-    deallocRaw(Buf.Extents, 1);
-  }
   for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
     B->~BufferRep();
-  deallocRaw(Buffers, BufferCount);
-  deallocRaw(OwnedBuffers, BufferCount);
+  deallocateBuffer(Buffers, BufferCount);
+  deallocateBuffer(BackingStore, BufferSize * BufferCount);
 }
diff --git a/lib/xray/xray_buffer_queue.h b/lib/xray/xray_buffer_queue.h
index e76fa79..c1fa9fa 100644
--- a/lib/xray/xray_buffer_queue.h
+++ b/lib/xray/xray_buffer_queue.h
@@ -18,7 +18,9 @@
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_mutex.h"
+#include "xray_defs.h"
 #include <cstddef>
+#include <cstdint>
 
 namespace __xray {
 
@@ -29,14 +31,10 @@
 /// trace collection.
 class BufferQueue {
 public:
-  struct alignas(64) BufferExtents {
-    atomic_uint64_t Size;
-  };
-
   struct Buffer {
+    atomic_uint64_t Extents{0};
     void *Data = nullptr;
     size_t Size = 0;
-    BufferExtents *Extents;
   };
 
   struct BufferRep {
@@ -76,8 +74,10 @@
 
     T *operator->() const { return &(Buffers[Offset].Buff); }
 
-    Iterator(BufferRep *Root, size_t O, size_t M)
-        : Buffers(Root), Offset(O), Max(M) {
+    Iterator(BufferRep *Root, size_t O, size_t M) XRAY_NEVER_INSTRUMENT
+        : Buffers(Root),
+          Offset(O),
+          Max(M) {
       // We want to advance to the first Offset where the 'Used' property is
       // true, or to the end of the list/queue.
       while (!Buffers[Offset].Used && Offset != Max) {
@@ -107,16 +107,18 @@
   // Size of each individual Buffer.
   size_t BufferSize;
 
-  BufferRep *Buffers;
-
   // Amount of pre-allocated buffers.
   size_t BufferCount;
 
   SpinMutex Mutex;
   atomic_uint8_t Finalizing;
 
-  // Pointers to buffers managed/owned by the BufferQueue.
-  void **OwnedBuffers;
+  // A pointer to a contiguous block of memory to serve as the backing store for
+  // all the individual buffers handed out.
+  uint8_t *BackingStore;
+
+  // A dynamically allocated array of BufferRep instances.
+  BufferRep *Buffers;
 
   // Pointer to the next buffer to be handed out.
   BufferRep *Next;
@@ -198,7 +200,7 @@
   /// Applies the provided function F to each Buffer in the queue, only if the
   /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a
   /// releaseBuffer(...) operation).
-  template <class F> void apply(F Fn) {
+  template <class F> void apply(F Fn) XRAY_NEVER_INSTRUMENT {
     SpinMutexLock G(&Mutex);
     for (auto I = begin(), E = end(); I != E; ++I)
       Fn(*I);
diff --git a/lib/xray/xray_fdr_log_records.h b/lib/xray/xray_fdr_log_records.h
index 87096d4..e7b1ee5 100644
--- a/lib/xray/xray_fdr_log_records.h
+++ b/lib/xray/xray_fdr_log_records.h
@@ -12,6 +12,9 @@
 //===----------------------------------------------------------------------===//
 #ifndef XRAY_XRAY_FDR_LOG_RECORDS_H
 #define XRAY_XRAY_FDR_LOG_RECORDS_H
+#include <cstdint>
+
+namespace __xray {
 
 enum class RecordType : uint8_t { Function, Metadata };
 
@@ -68,4 +71,6 @@
 
 static_assert(sizeof(FunctionRecord) == 8, "Wrong size for FunctionRecord.");
 
+} // namespace __xray
+
 #endif // XRAY_XRAY_FDR_LOG_RECORDS_H
diff --git a/lib/xray/xray_fdr_log_writer.h b/lib/xray/xray_fdr_log_writer.h
new file mode 100644
index 0000000..28af356
--- /dev/null
+++ b/lib/xray/xray_fdr_log_writer.h
@@ -0,0 +1,119 @@
+//===-- xray_fdr_log_writer.h ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_
+#define COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_
+
+#include "xray_buffer_queue.h"
+#include "xray_fdr_log_records.h"
+#include <functional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace __xray {
+
+template <size_t Index> struct SerializerImpl {
+  template <class Tuple,
+            typename std::enable_if<
+                Index<std::tuple_size<
+                          typename std::remove_reference<Tuple>::type>::value,
+                      int>::type = 0> static void serializeTo(char *Buffer,
+                                                              Tuple &&T) {
+    auto P = reinterpret_cast<const char *>(&std::get<Index>(T));
+    constexpr auto Size = sizeof(std::get<Index>(T));
+    internal_memcpy(Buffer, P, Size);
+    SerializerImpl<Index + 1>::serializeTo(Buffer + Size,
+                                           std::forward<Tuple>(T));
+  }
+
+  template <class Tuple,
+            typename std::enable_if<
+                Index >= std::tuple_size<typename std::remove_reference<
+                             Tuple>::type>::value,
+                int>::type = 0>
+  static void serializeTo(char *, Tuple &&){};
+};
+
+using Serializer = SerializerImpl<0>;
+
+template <MetadataRecord::RecordKinds Kind, class... DataTypes>
+MetadataRecord createMetadataRecord(DataTypes &&... Ds) {
+  MetadataRecord R;
+  R.Type = 1;
+  R.RecordKind = static_cast<uint8_t>(Kind);
+  Serializer::serializeTo(R.Data,
+                          std::make_tuple(std::forward<DataTypes>(Ds)...));
+  return R;
+}
+
+class FDRLogWriter {
+  BufferQueue::Buffer &Buffer;
+  char *NextRecord = nullptr;
+
+  template <class T> void writeRecord(const T &R) {
+    internal_memcpy(NextRecord, reinterpret_cast<const char *>(&R), sizeof(T));
+    NextRecord += sizeof(T);
+    atomic_fetch_add(&Buffer.Extents, sizeof(T), memory_order_acq_rel);
+  }
+
+public:
+  explicit FDRLogWriter(BufferQueue::Buffer &B, char *P)
+      : Buffer(B), NextRecord(P) {
+    DCHECK_NE(Buffer.Data, nullptr);
+    DCHECK_NE(NextRecord, nullptr);
+  }
+
+  explicit FDRLogWriter(BufferQueue::Buffer &B)
+      : FDRLogWriter(B, static_cast<char *>(B.Data)) {}
+
+  template <MetadataRecord::RecordKinds Kind, class... Data>
+  bool writeMetadata(Data &&... Ds) {
+    // TODO: Check boundary conditions:
+    // 1) Buffer is full, and cannot handle one metadata record.
+    // 2) Buffer queue is finalising.
+    writeRecord(createMetadataRecord<Kind>(std::forward<Data>(Ds)...));
+    return true;
+  }
+
+  template <size_t N> size_t writeMetadataRecords(MetadataRecord (&Recs)[N]) {
+    constexpr auto Size = sizeof(MetadataRecord) * N;
+    internal_memcpy(NextRecord, reinterpret_cast<const char *>(Recs), Size);
+    NextRecord += Size;
+    atomic_fetch_add(&Buffer.Extents, Size, memory_order_acq_rel);
+    return Size;
+  }
+
+  enum class FunctionRecordKind : uint8_t {
+    Enter = 0x00,
+    Exit = 0x01,
+    TailExit = 0x02,
+    EnterArg = 0x03,
+  };
+
+  bool writeFunction(FunctionRecordKind Kind, int32_t FuncId, int32_t Delta) {
+    FunctionRecord R;
+    R.Type = 0;
+    R.RecordKind = uint8_t(Kind);
+    R.FuncId = FuncId;
+    R.TSCDelta = Delta;
+    writeRecord(R);
+    return true;
+  }
+
+  char *getNextRecord() const { return NextRecord; }
+
+}; // namespace __xray
+
+} // namespace __xray
+
+#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_
diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc
index 6cb2dfa..2d6af44 100644
--- a/lib/xray/xray_fdr_logging.cc
+++ b/lib/xray/xray_fdr_logging.cc
@@ -30,9 +30,11 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "xray/xray_interface.h"
 #include "xray/xray_records.h"
+#include "xray_allocator.h"
 #include "xray_buffer_queue.h"
 #include "xray_defs.h"
 #include "xray_fdr_flags.h"
+#include "xray_fdr_log_writer.h"
 #include "xray_flags.h"
 #include "xray_recursion_guard.h"
 #include "xray_tsc.h"
@@ -40,14 +42,16 @@
 
 namespace __xray {
 
-atomic_sint32_t LoggingStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+static atomic_sint32_t LoggingStatus = {
+    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
 
+namespace {
 // Group together thread-local-data in a struct, then hide it behind a function
 // call so that it can be initialized on first use instead of as a global. We
 // force the alignment to 64-bytes for x86 cache line alignment, as this
 // structure is used in the hot path of implementation.
 struct alignas(64) ThreadLocalData {
-  BufferQueue::Buffer Buffer;
+  BufferQueue::Buffer Buffer{};
   char *RecordPtr = nullptr;
   // The number of FunctionEntry records immediately preceding RecordPtr.
   uint8_t NumConsecutiveFnEnters = 0;
@@ -70,6 +74,7 @@
   // FDRLogging, and that we're going to clean it up when the thread exits.
   BufferQueue *BQ = nullptr;
 };
+} // namespace
 
 static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
               "ThreadLocalData must be trivially destructible");
@@ -81,15 +86,12 @@
 static pthread_key_t Key;
 
 // Global BufferQueue.
+static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage;
 static BufferQueue *BQ = nullptr;
 
 static atomic_sint32_t LogFlushStatus = {
     XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
 
-static FDRLoggingOptions FDROptions;
-
-static SpinMutex FDROptionsMutex;
-
 // This function will initialize the thread-local data structure used by the FDR
 // logging implementation and return a reference to it. The implementation
 // details require a bit of care to maintain.
@@ -140,60 +142,35 @@
 
 static void writeNewBufferPreamble(tid_t Tid, timespec TS,
                                    pid_t Pid) XRAY_NEVER_INSTRUMENT {
-  static constexpr int InitRecordsCount = 3;
+  static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes");
   auto &TLD = getThreadLocalData();
-  MetadataRecord Metadata[InitRecordsCount];
-  {
-    // Write out a MetadataRecord to signify that this is the start of a new
-    // buffer, associated with a particular thread, with a new CPU.  For the
-    // data, we have 15 bytes to squeeze as much information as we can.  At this
-    // point we only write down the following bytes:
-    //   - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 bytes)
-    auto &NewBuffer = Metadata[0];
-    NewBuffer.Type = uint8_t(RecordType::Metadata);
-    NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer);
-    int32_t tid = static_cast<int32_t>(Tid);
-    internal_memcpy(&NewBuffer.Data, &tid, sizeof(tid));
-  }
+  MetadataRecord Metadata[] = {
+      // Write out a MetadataRecord to signify that this is the start of a new
+      // buffer, associated with a particular thread, with a new CPU. For the
+      // data, we have 15 bytes to squeeze as much information as we can. At
+      // this point we only write down the following bytes:
+      //   - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 bytes)
+      createMetadataRecord<MetadataRecord::RecordKinds::NewBuffer>(
+          static_cast<int32_t>(Tid)),
 
-  // Also write the WalltimeMarker record.
-  {
-    static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes");
-    auto &WalltimeMarker = Metadata[1];
-    WalltimeMarker.Type = uint8_t(RecordType::Metadata);
-    WalltimeMarker.RecordKind =
-        uint8_t(MetadataRecord::RecordKinds::WalltimeMarker);
+      // Also write the WalltimeMarker record. We only really need microsecond
+      // precision here, and enforce across platforms that we need 64-bit
+      // seconds and 32-bit microseconds encoded in the Metadata record.
+      createMetadataRecord<MetadataRecord::RecordKinds::WalltimeMarker>(
+          static_cast<int64_t>(TS.tv_sec),
+          static_cast<int32_t>(TS.tv_nsec / 1000)),
 
-    // We only really need microsecond precision here, and enforce across
-    // platforms that we need 64-bit seconds and 32-bit microseconds encoded in
-    // the Metadata record.
-    int32_t Micros = TS.tv_nsec / 1000;
-    int64_t Seconds = TS.tv_sec;
-    internal_memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds));
-    internal_memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros,
-                    sizeof(Micros));
-  }
-
-  // Also write the Pid record.
-  {
-    // Write out a MetadataRecord that contains the current pid
-    auto &PidMetadata = Metadata[2];
-    PidMetadata.Type = uint8_t(RecordType::Metadata);
-    PidMetadata.RecordKind = uint8_t(MetadataRecord::RecordKinds::Pid);
-    int32_t pid = static_cast<int32_t>(Pid);
-    internal_memcpy(&PidMetadata.Data, &pid, sizeof(pid));
-  }
+      // Also write the Pid record.
+      createMetadataRecord<MetadataRecord::RecordKinds::Pid>(
+          static_cast<int32_t>(Pid)),
+  };
 
   TLD.NumConsecutiveFnEnters = 0;
   TLD.NumTailCalls = 0;
   if (TLD.BQ == nullptr || TLD.BQ->finalizing())
     return;
-  internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
-  TLD.RecordPtr += sizeof(Metadata);
-  // Since we write out the extents as the first metadata record of the
-  // buffer, we need to write out the extents including the extents record.
-  atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
-               memory_order_release);
+  FDRLogWriter Writer(TLD.Buffer);
+  TLD.RecordPtr += Writer.writeMetadataRecords(Metadata);
 }
 
 static void setupNewBuffer(int (*wall_clock_reader)(
@@ -201,6 +178,7 @@
   auto &TLD = getThreadLocalData();
   auto &B = TLD.Buffer;
   TLD.RecordPtr = static_cast<char *>(B.Data);
+  atomic_store(&B.Extents, 0, memory_order_release);
   tid_t Tid = GetTid();
   timespec TS{0, 0};
   pid_t Pid = internal_getpid();
@@ -213,91 +191,90 @@
 
 static void incrementExtents(size_t Add) {
   auto &TLD = getThreadLocalData();
-  atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel);
+  atomic_fetch_add(&TLD.Buffer.Extents, Add, memory_order_acq_rel);
 }
 
 static void decrementExtents(size_t Subtract) {
   auto &TLD = getThreadLocalData();
-  atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel);
+  atomic_fetch_sub(&TLD.Buffer.Extents, Subtract, memory_order_acq_rel);
 }
 
 static void writeNewCPUIdMetadata(uint16_t CPU,
                                   uint64_t TSC) XRAY_NEVER_INSTRUMENT {
   auto &TLD = getThreadLocalData();
-  MetadataRecord NewCPUId;
-  NewCPUId.Type = uint8_t(RecordType::Metadata);
-  NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
+  FDRLogWriter W(TLD.Buffer, TLD.RecordPtr);
 
   // The data for the New CPU will contain the following bytes:
   //   - CPU ID (uint16_t, 2 bytes)
   //   - Full TSC (uint64_t, 8 bytes)
   // Total = 10 bytes.
-  internal_memcpy(&NewCPUId.Data, &CPU, sizeof(CPU));
-  internal_memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC));
-  internal_memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord));
-  TLD.RecordPtr += sizeof(MetadataRecord);
+  W.writeMetadata<MetadataRecord::RecordKinds::NewCPUId>(CPU, TSC);
+  TLD.RecordPtr = W.getNextRecord();
   TLD.NumConsecutiveFnEnters = 0;
   TLD.NumTailCalls = 0;
-  incrementExtents(sizeof(MetadataRecord));
 }
 
 static void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
   auto &TLD = getThreadLocalData();
-  MetadataRecord TSCWrap;
-  TSCWrap.Type = uint8_t(RecordType::Metadata);
-  TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
+  FDRLogWriter W(TLD.Buffer, TLD.RecordPtr);
 
   // The data for the TSCWrap record contains the following bytes:
   //   - Full TSC (uint64_t, 8 bytes)
   // Total = 8 bytes.
-  internal_memcpy(&TSCWrap.Data, &TSC, sizeof(TSC));
-  internal_memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord));
-  TLD.RecordPtr += sizeof(MetadataRecord);
+  W.writeMetadata<MetadataRecord::RecordKinds::TSCWrap>(TSC);
+  TLD.RecordPtr = W.getNextRecord();
   TLD.NumConsecutiveFnEnters = 0;
   TLD.NumTailCalls = 0;
-  incrementExtents(sizeof(MetadataRecord));
 }
 
 // Call Argument metadata records store the arguments to a function in the
 // order of their appearance; holes are not supported by the buffer format.
 static void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT {
   auto &TLD = getThreadLocalData();
-  MetadataRecord CallArg;
-  CallArg.Type = uint8_t(RecordType::Metadata);
-  CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument);
-
-  internal_memcpy(CallArg.Data, &A, sizeof(A));
-  internal_memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord));
-  TLD.RecordPtr += sizeof(MetadataRecord);
-  incrementExtents(sizeof(MetadataRecord));
+  FDRLogWriter W(TLD.Buffer, TLD.RecordPtr);
+  W.writeMetadata<MetadataRecord::RecordKinds::CallArgument>(A);
+  TLD.RecordPtr = W.getNextRecord();
 }
 
-static void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
+static void writeFunctionRecord(int32_t FuncId, uint32_t TSCDelta,
                                 XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT {
-  FunctionRecord FuncRecord;
-  FuncRecord.Type = uint8_t(RecordType::Function);
-  // Only take 28 bits of the function id.
-  FuncRecord.FuncId = FuncId & ~(0x0F << 28);
-  FuncRecord.TSCDelta = TSCDelta;
+  constexpr int32_t MaxFuncId = (1 << 29) - 1;
+  if (UNLIKELY(FuncId > MaxFuncId)) {
+    if (Verbosity())
+      Report("Warning: Function ID '%d' > max function id: '%d'", FuncId,
+             MaxFuncId);
+    return;
+  }
 
   auto &TLD = getThreadLocalData();
+  FDRLogWriter W(TLD.Buffer, TLD.RecordPtr);
+
+  // Only take 28 bits of the function id.
+  //
+  // We need to be careful about the sign bit and the bitwise operations being
+  // performed here. In effect, we want to truncate the value of the function id
+  // to the first 28 bits. To do this properly, this means we need to mask the
+  // function id with (2 ^ 28) - 1 == 0x0fffffff.
+  //
+  auto TruncatedId = FuncId & MaxFuncId;
+  auto Kind = FDRLogWriter::FunctionRecordKind::Enter;
+
   switch (EntryType) {
   case XRayEntryType::ENTRY:
     ++TLD.NumConsecutiveFnEnters;
-    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
     break;
   case XRayEntryType::LOG_ARGS_ENTRY:
     // We should not rewind functions with logged args.
     TLD.NumConsecutiveFnEnters = 0;
     TLD.NumTailCalls = 0;
-    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
+    Kind = FDRLogWriter::FunctionRecordKind::EnterArg;
     break;
   case XRayEntryType::EXIT:
     // If we've decided to log the function exit, we will never erase the log
     // before it.
     TLD.NumConsecutiveFnEnters = 0;
     TLD.NumTailCalls = 0;
-    FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit);
+    Kind = FDRLogWriter::FunctionRecordKind::Exit;
     break;
   case XRayEntryType::TAIL:
     // If we just entered the function we're tail exiting from or erased every
@@ -312,8 +289,7 @@
       TLD.NumTailCalls = 0;
       TLD.NumConsecutiveFnEnters = 0;
     }
-    FuncRecord.RecordKind =
-        uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
+    Kind = FDRLogWriter::FunctionRecordKind::TailExit;
     break;
   case XRayEntryType::CUSTOM_EVENT: {
     // This is a bug in patching, so we'll report it once and move on.
@@ -334,9 +310,8 @@
   }
   }
 
-  internal_memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord));
-  TLD.RecordPtr += sizeof(FunctionRecord);
-  incrementExtents(sizeof(FunctionRecord));
+  W.writeFunction(Kind, TruncatedId, TSCDelta);
+  TLD.RecordPtr = W.getNextRecord();
 }
 
 static atomic_uint64_t TicksPerSec{0};
@@ -345,7 +320,8 @@
 // Re-point the thread local pointer into this thread's Buffer before the recent
 // "Function Entry" record and any "Tail Call Exit" records after that.
 static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
-                             uint64_t &LastFunctionEntryTSC, int32_t FuncId) {
+                             uint64_t &LastFunctionEntryTSC,
+                             int32_t FuncId) XRAY_NEVER_INSTRUMENT {
   auto &TLD = getThreadLocalData();
   TLD.RecordPtr -= FunctionRecSize;
   decrementExtents(FunctionRecSize);
@@ -410,6 +386,9 @@
 static bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
   auto &TLD = getThreadLocalData();
   auto EC = BQArg.releaseBuffer(TLD.Buffer);
+  if (TLD.Buffer.Data == nullptr)
+    return true;
+
   if (EC != BufferQueue::ErrorCode::Ok) {
     Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Data,
            BufferQueue::getErrorString(EC));
@@ -521,6 +500,7 @@
     writeNewCPUIdMetadata(CPU, TSC);
     return 0;
   }
+
   // If the delta is greater than the range for a uint32_t, then we write out
   // the TSC wrap metadata entry with the full TSC, and the TSC for the
   // function record be 0.
@@ -570,7 +550,7 @@
 
   auto &TLD = getThreadLocalData();
 
-  if (TLD.BQ == nullptr)
+  if (TLD.BQ == nullptr && BQ != nullptr)
     TLD.BQ = BQ;
 
   if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
@@ -596,14 +576,16 @@
   //       1. When the delta between the TSC we get and the previous TSC for the
   //          same CPU is outside of the uint32_t range, we end up having to
   //          write a MetadataRecord to indicate a "tsc wrap" before the actual
-  //          FunctionRecord.
+  //          FunctionRecord. This means we have: 1 MetadataRecord + 1 Function
+  //          Record.
   //       2. When we learn that we've moved CPUs, we need to write a
   //          MetadataRecord to indicate a "cpu change", and thus write out the
   //          current TSC for that CPU before writing out the actual
-  //          FunctionRecord.
-  //       3. When we learn about a new CPU ID, we need to write down a "new cpu
-  //          id" MetadataRecord before writing out the actual FunctionRecord.
-  //       4. The second MetadataRecord is the optional function call argument.
+  //          FunctionRecord. This means we have: 1 MetadataRecord + 1 Function
+  //          Record.
+  //       3. Given the previous two cases, in addition we can add at most one
+  //          function argument record. This means we have: 2 MetadataRecord + 1
+  //          Function Record.
   //
   // So the math we need to do is to determine whether writing 40 bytes past the
   // current pointer exceeds the buffer's maximum size. If we don't have enough
@@ -615,20 +597,21 @@
     return;
   }
 
-  // By this point, we are now ready to write up to 40 bytes (explained above).
-  DCHECK((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Data) >=
-             static_cast<ptrdiff_t>(MetadataRecSize) &&
-         "Misconfigured BufferQueue provided; Buffer size not large enough.");
-
   auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU);
   TLD.LastTSC = TSC;
   TLD.CurrentCPU = CPU;
   switch (Entry) {
   case XRayEntryType::ENTRY:
-  case XRayEntryType::LOG_ARGS_ENTRY:
     // Update the thread local state for the next invocation.
     TLD.LastFunctionEntryTSC = TSC;
     break;
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    // Update the thread local state for the next invocation, but also prevent
+    // rewinding when we have arguments logged.
+    TLD.LastFunctionEntryTSC = TSC;
+    TLD.NumConsecutiveFnEnters = 0;
+    TLD.NumTailCalls = 0;
+    break;
   case XRayEntryType::TAIL:
   case XRayEntryType::EXIT:
     // Break out and write the exit record if we can't erase any functions.
@@ -741,7 +724,8 @@
 
   static BufferQueue::const_iterator It{};
   static BufferQueue::const_iterator End{};
-  static void *CurrentBuffer{nullptr};
+  static uint8_t *CurrentBuffer{nullptr};
+  static size_t SerializedBufferSize = 0;
   if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
     // From this point on, we provide raw access to the raw buffer we're getting
     // from the BufferQueue. We're relying on the iterators from the current
@@ -751,7 +735,7 @@
   }
 
   if (CurrentBuffer != nullptr) {
-    InternalFree(CurrentBuffer);
+    deallocateBuffer(CurrentBuffer, SerializedBufferSize);
     CurrentBuffer = nullptr;
   }
 
@@ -762,9 +746,9 @@
   // out to disk. The difference here would be that we still write "empty"
   // buffers, or at least go through the iterators faithfully to let the
   // handlers see the empty buffers in the queue.
-  auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire);
-  auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
-  CurrentBuffer = InternalAlloc(SerializedBufferSize);
+  auto BufferSize = atomic_load(&It->Extents, memory_order_acquire);
+  SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
+  CurrentBuffer = allocateBuffer(SerializedBufferSize);
   if (CurrentBuffer == nullptr)
     return {nullptr, 0};
 
@@ -832,7 +816,6 @@
       if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
         releaseThreadLocalBuffer(*TLD.BQ);
       BQ->~BufferQueue();
-      InternalFree(BQ);
       BQ = nullptr;
     }
   });
@@ -855,15 +838,7 @@
   //      (fixed-sized) and let the tools reading the buffers deal with the data
   //      afterwards.
   //
-  int Fd = -1;
-  {
-    // FIXME: Remove this section of the code, when we remove the struct-based
-    // configuration API.
-    SpinMutexLock Guard(&FDROptionsMutex);
-    Fd = FDROptions.Fd;
-  }
-  if (Fd == -1)
-    Fd = getLogFD();
+  int Fd = getLogFD();
   if (Fd == -1) {
     auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
     atomic_store(&LogFlushStatus, Result, memory_order_release);
@@ -875,6 +850,13 @@
   retryingWriteAll(Fd, reinterpret_cast<char *>(&Header),
                    reinterpret_cast<char *>(&Header) + sizeof(Header));
 
+  // Release the current thread's buffer before we attempt to write out all the
+  // buffers. This ensures that in case we had only a single thread going, that
+  // we are able to capture the data nonetheless.
+  auto &TLD = getThreadLocalData();
+  if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
+    releaseThreadLocalBuffer(*TLD.BQ);
+
   BQ->apply([&](const BufferQueue::Buffer &B) {
     // Starting at version 2 of the FDR logging implementation, we only write
     // the records identified by the extents of the buffer. We use the Extents
@@ -882,7 +864,7 @@
     // still use a Metadata record, but fill in the extents instead for the
     // data.
     MetadataRecord ExtentsRecord;
-    auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire);
+    auto BufferExtents = atomic_load(&B.Extents, memory_order_acquire);
     DCHECK(BufferExtents <= B.Size);
     ExtentsRecord.Type = uint8_t(RecordType::Metadata);
     ExtentsRecord.RecordKind =
@@ -914,7 +896,12 @@
 
   // Do special things to make the log finalize itself, and not allow any more
   // operations to be performed until re-initialized.
-  BQ->finalize();
+  if (BQ == nullptr) {
+    if (Verbosity())
+      Report("Attempting to finalize an uninitialized global buffer!\n");
+  } else {
+    BQ->finalize();
+  }
 
   atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
                memory_order_release);
@@ -986,11 +973,17 @@
   //   - The metadata record we're going to write. (16 bytes)
   //   - The additional data we're going to write. Currently, that's the size
   //   of the event we're going to dump into the log as free-form bytes.
-  if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
+  if (!prepareBuffer(TSC, CPU, clock_gettime,
+                     MetadataRecSize + ReducedEventSize)) {
     TLD.BQ = nullptr;
     return;
   }
 
+  // We need to reset the counts for the number of functions we're able to
+  // rewind.
+  TLD.NumConsecutiveFnEnters = 0;
+  TLD.NumTailCalls = 0;
+
   // Write the custom event metadata record, which consists of the following
   // information:
   //   - 8 bytes (64-bits) for the full TSC when the event started.
@@ -1001,11 +994,12 @@
       uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
   constexpr auto TSCSize = sizeof(TC.TSC);
   internal_memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
-  internal_memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
+  internal_memcpy(&CustomEvent.Data + sizeof(int32_t), &TSC, TSCSize);
   internal_memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent));
   TLD.RecordPtr += sizeof(CustomEvent);
   internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
-  incrementExtents(MetadataRecSize + EventSize);
+  TLD.RecordPtr += ReducedEventSize;
+  incrementExtents(MetadataRecSize + ReducedEventSize);
   endBufferIfFull();
 }
 
@@ -1031,7 +1025,8 @@
   //   - The metadata record we're going to write. (16 bytes)
   //   - The additional data we're going to write. Currently, that's the size
   //   of the event we're going to dump into the log as free-form bytes.
-  if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
+  if (!prepareBuffer(TSC, CPU, clock_gettime,
+                     MetadataRecSize + ReducedEventSize)) {
     TLD.BQ = nullptr;
     return;
   }
@@ -1056,12 +1051,13 @@
 
   TLD.RecordPtr += sizeof(TypedEvent);
   internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
+  TLD.RecordPtr += ReducedEventSize;
   incrementExtents(MetadataRecSize + EventSize);
   endBufferIfFull();
 }
 
-XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax,
-                                 void *Options,
+XRayLogInitStatus fdrLoggingInit(UNUSED size_t BufferSize,
+                                 UNUSED size_t BufferMax, void *Options,
                                  size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
   if (Options == nullptr)
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
@@ -1075,76 +1071,51 @@
     return static_cast<XRayLogInitStatus>(CurrentStatus);
   }
 
-  // Because of __xray_log_init_mode(...) which guarantees that this will be
-  // called with BufferSize == 0 and BufferMax == 0 we parse the configuration
-  // provided in the Options pointer as a string instead.
-  if (BufferSize == 0 && BufferMax == 0) {
-    if (Verbosity())
-      Report("Initializing FDR mode with options: %s\n",
-             static_cast<const char *>(Options));
+  if (Verbosity())
+    Report("Initializing FDR mode with options: %s\n",
+           static_cast<const char *>(Options));
 
-    // TODO: Factor out the flags specific to the FDR mode implementation. For
-    // now, use the global/single definition of the flags, since the FDR mode
-    // flags are already defined there.
-    FlagParser FDRParser;
-    FDRFlags FDRFlags;
-    registerXRayFDRFlags(&FDRParser, &FDRFlags);
-    FDRFlags.setDefaults();
+  // TODO: Factor out the flags specific to the FDR mode implementation. For
+  // now, use the global/single definition of the flags, since the FDR mode
+  // flags are already defined there.
+  FlagParser FDRParser;
+  FDRFlags FDRFlags;
+  registerXRayFDRFlags(&FDRParser, &FDRFlags);
+  FDRFlags.setDefaults();
 
-    // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
-    // options until we migrate everyone to use the XRAY_FDR_OPTIONS
-    // compiler-provided options.
-    FDRParser.ParseString(useCompilerDefinedFlags());
-    FDRParser.ParseString(useCompilerDefinedFDRFlags());
-    auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
-    if (EnvOpts == nullptr)
-      EnvOpts = "";
-    FDRParser.ParseString(EnvOpts);
+  // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
+  // options until we migrate everyone to use the XRAY_FDR_OPTIONS
+  // compiler-provided options.
+  FDRParser.ParseString(useCompilerDefinedFlags());
+  FDRParser.ParseString(useCompilerDefinedFDRFlags());
+  auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
+  if (EnvOpts == nullptr)
+    EnvOpts = "";
+  FDRParser.ParseString(EnvOpts);
 
-    // FIXME: Remove this when we fully remove the deprecated flags.
-    if (internal_strlen(EnvOpts) == 0) {
-      FDRFlags.func_duration_threshold_us =
-          flags()->xray_fdr_log_func_duration_threshold_us;
-      FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
-    }
-
-    // The provided options should always override the compiler-provided and
-    // environment-variable defined options.
-    FDRParser.ParseString(static_cast<const char *>(Options));
-    *fdrFlags() = FDRFlags;
-    BufferSize = FDRFlags.buffer_size;
-    BufferMax = FDRFlags.buffer_max;
-    SpinMutexLock Guard(&FDROptionsMutex);
-    FDROptions.Fd = -1;
-    FDROptions.ReportErrors = true;
-  } else if (OptionsSize != sizeof(FDRLoggingOptions)) {
-    // FIXME: This is deprecated, and should really be removed.
-    // At this point we use the flag parser specific to the FDR mode
-    // implementation.
-    if (Verbosity())
-      Report("Cannot initialize FDR logging; wrong size for options: %d\n",
-             OptionsSize);
-    return static_cast<XRayLogInitStatus>(
-        atomic_load(&LoggingStatus, memory_order_acquire));
-  } else {
-    if (Verbosity())
-      Report("XRay FDR: struct-based init is deprecated, please use "
-             "string-based configuration instead.\n");
-    SpinMutexLock Guard(&FDROptionsMutex);
-    internal_memcpy(&FDROptions, Options, OptionsSize);
+  // FIXME: Remove this when we fully remove the deprecated flags.
+  if (internal_strlen(EnvOpts) == 0) {
+    FDRFlags.func_duration_threshold_us =
+        flags()->xray_fdr_log_func_duration_threshold_us;
+    FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
   }
 
+  // The provided options should always override the compiler-provided and
+  // environment-variable defined options.
+  FDRParser.ParseString(static_cast<const char *>(Options));
+  *fdrFlags() = FDRFlags;
+  BufferSize = FDRFlags.buffer_size;
+  BufferMax = FDRFlags.buffer_max;
+
   bool Success = false;
 
   if (BQ != nullptr) {
     BQ->~BufferQueue();
-    InternalFree(BQ);
     BQ = nullptr;
   }
 
   if (BQ == nullptr) {
-    BQ = reinterpret_cast<BufferQueue *>(
-        InternalAlloc(sizeof(BufferQueue), nullptr, 64));
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
     new (BQ) BufferQueue(BufferSize, BufferMax, Success);
   }
 
@@ -1152,7 +1123,6 @@
     Report("BufferQueue init failed.\n");
     if (BQ != nullptr) {
       BQ->~BufferQueue();
-      InternalFree(BQ);
       BQ = nullptr;
     }
     return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
@@ -1170,6 +1140,8 @@
       auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
       if (TLD.BQ == nullptr)
         return;
+      if (TLD.Buffer.Data == nullptr)
+        return;
       auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
       if (EC != BufferQueue::ErrorCode::Ok)
         Report("At thread exit, failed to release buffer at %p; error=%s\n",
@@ -1209,11 +1181,22 @@
   };
   auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl);
   if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
-      Verbosity())
+      Verbosity()) {
     Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n",
            RegistrationResult);
-  if (flags()->xray_fdr_log || !internal_strcmp(flags()->xray_mode, "xray-fdr"))
-    __xray_set_log_impl(Impl);
+    return false;
+  }
+
+  if (flags()->xray_fdr_log ||
+      !internal_strcmp(flags()->xray_mode, "xray-fdr")) {
+    auto SelectResult = __xray_log_select_mode("xray-fdr");
+    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
+        Verbosity()) {
+      Report("Cannot select XRay FDR mode as 'xray-fdr'; error = %d\n",
+             SelectResult);
+      return false;
+    }
+  }
   return true;
 }
 
diff --git a/lib/xray/xray_function_call_trie.h b/lib/xray/xray_function_call_trie.h
index 2acf14a..f4c2fc3 100644
--- a/lib/xray/xray_function_call_trie.h
+++ b/lib/xray/xray_function_call_trie.h
@@ -15,7 +15,7 @@
 #ifndef XRAY_FUNCTION_CALL_TRIE_H
 #define XRAY_FUNCTION_CALL_TRIE_H
 
-#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "xray_defs.h"
 #include "xray_profiling_flags.h"
 #include "xray_segmented_array.h"
 #include <memory> // For placement new.
@@ -120,9 +120,11 @@
     // We add a constructor here to allow us to inplace-construct through
     // Array<...>'s AppendEmplace.
     Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
-         int32_t F)
-        : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
-          FId(F) {}
+         int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P),
+                                            Callees(A),
+                                            CallCount(CC),
+                                            CumulativeLocalTime(CLT),
+                                            FId(F) {}
 
     // TODO: Include the compact histogram.
   };
@@ -134,7 +136,8 @@
 
     // We add a constructor here to allow us to inplace-construct through
     // Array<...>'s AppendEmplace.
-    ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {}
+    ShadowStackEntry(uint64_t T, Node *N) XRAY_NEVER_INSTRUMENT : EntryTSC{T},
+                                                                  NodePtr{N} {}
   };
 
   using NodeArray = Array<Node>;
@@ -158,8 +161,9 @@
     Allocators(const Allocators &) = delete;
     Allocators &operator=(const Allocators &) = delete;
 
-    Allocators(Allocators &&O)
-        : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
+    Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT
+        : NodeAllocator(O.NodeAllocator),
+          RootAllocator(O.RootAllocator),
           ShadowStackAllocator(O.ShadowStackAllocator),
           NodeIdPairAllocator(O.NodeIdPairAllocator) {
       O.NodeAllocator = nullptr;
@@ -168,7 +172,7 @@
       O.NodeIdPairAllocator = nullptr;
     }
 
-    Allocators &operator=(Allocators &&O) {
+    Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT {
       {
         auto Tmp = O.NodeAllocator;
         O.NodeAllocator = this->NodeAllocator;
@@ -192,58 +196,54 @@
       return *this;
     }
 
-    ~Allocators() {
+    ~Allocators() XRAY_NEVER_INSTRUMENT {
       // Note that we cannot use delete on these pointers, as they need to be
       // returned to the sanitizer_common library's internal memory tracking
       // system.
       if (NodeAllocator != nullptr) {
         NodeAllocator->~NodeAllocatorType();
-        InternalFree(NodeAllocator);
+        deallocate(NodeAllocator);
         NodeAllocator = nullptr;
       }
       if (RootAllocator != nullptr) {
         RootAllocator->~RootAllocatorType();
-        InternalFree(RootAllocator);
+        deallocate(RootAllocator);
         RootAllocator = nullptr;
       }
       if (ShadowStackAllocator != nullptr) {
         ShadowStackAllocator->~ShadowStackAllocatorType();
-        InternalFree(ShadowStackAllocator);
+        deallocate(ShadowStackAllocator);
         ShadowStackAllocator = nullptr;
       }
       if (NodeIdPairAllocator != nullptr) {
         NodeIdPairAllocator->~NodeIdPairAllocatorType();
-        InternalFree(NodeIdPairAllocator);
+        deallocate(NodeIdPairAllocator);
         NodeIdPairAllocator = nullptr;
       }
     }
   };
 
   // TODO: Support configuration of options through the arguments.
-  static Allocators InitAllocators() {
+  static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT {
     return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
   }
 
-  static Allocators InitAllocatorsCustom(uptr Max) {
+  static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT {
     Allocators A;
-    auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
-        InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
+    auto NodeAllocator = allocate<Allocators::NodeAllocatorType>();
     new (NodeAllocator) Allocators::NodeAllocatorType(Max);
     A.NodeAllocator = NodeAllocator;
 
-    auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
-        InternalAlloc(sizeof(Allocators::RootAllocatorType)));
+    auto RootAllocator = allocate<Allocators::RootAllocatorType>();
     new (RootAllocator) Allocators::RootAllocatorType(Max);
     A.RootAllocator = RootAllocator;
 
     auto ShadowStackAllocator =
-        reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
-            InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
+        allocate<Allocators::ShadowStackAllocatorType>();
     new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max);
     A.ShadowStackAllocator = ShadowStackAllocator;
 
-    auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
-        InternalAlloc(sizeof(NodeIdPairAllocatorType)));
+    auto NodeIdPairAllocator = allocate<NodeIdPairAllocatorType>();
     new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
     A.NodeIdPairAllocator = NodeIdPairAllocator;
     return A;
@@ -256,12 +256,13 @@
   NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
 
 public:
-  explicit FunctionCallTrie(const Allocators &A)
-      : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+  explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT
+      : Nodes(*A.NodeAllocator),
+        Roots(*A.RootAllocator),
         ShadowStack(*A.ShadowStackAllocator),
         NodeIdPairAllocator(A.NodeIdPairAllocator) {}
 
-  void enterFunction(const int32_t FId, uint64_t TSC) {
+  void enterFunction(const int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
     DCHECK_NE(FId, 0);
     // This function primarily deals with ensuring that the ShadowStack is
     // consistent and ready for when an exit event is encountered.
@@ -301,7 +302,7 @@
     return;
   }
 
-  void exitFunction(int32_t FId, uint64_t TSC) {
+  void exitFunction(int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
     // When we exit a function, we look up the ShadowStack to see whether we've
     // entered this function before. We do as little processing here as we can,
     // since most of the hard work would have already been done at function
@@ -323,7 +324,7 @@
     }
   }
 
-  const RootArray &getRoots() const { return Roots; }
+  const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; }
 
   // The deepCopyInto operation will update the provided FunctionCallTrie by
   // re-creating the contents of this particular FunctionCallTrie in the other
@@ -338,7 +339,7 @@
   // synchronisation of both "this" and |O|.
   //
   // This function must *not* be called with a non-empty FunctionCallTrie |O|.
-  void deepCopyInto(FunctionCallTrie &O) const {
+  void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
     DCHECK(O.getRoots().empty());
 
     // We then push the root into a stack, to use as the parent marker for new
@@ -394,7 +395,7 @@
   //
   // This function is *not* thread-safe, and may require external
   // synchronisation of both "this" and |O|.
-  void mergeInto(FunctionCallTrie &O) const {
+  void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
     struct NodeAndTarget {
       FunctionCallTrie::Node *OrigNode;
       FunctionCallTrie::Node *TargetNode;
diff --git a/lib/xray/xray_init.cc b/lib/xray/xray_init.cc
index b4e0697..8886a60 100644
--- a/lib/xray/xray_init.cc
+++ b/lib/xray/xray_init.cc
@@ -27,6 +27,15 @@
 extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak));
 extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak));
 extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak));
+
+#if SANITIZER_MAC
+// HACK: This is a temporary workaround to make XRay build on 
+// Darwin, but it will probably not work at runtime.
+const XRaySledEntry __start_xray_instr_map[] = {};
+extern const XRaySledEntry __stop_xray_instr_map[] = {};
+extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {};
+extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {};
+#endif
 }
 
 using namespace __xray;
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
index 17a611e..a2a8f1f 100644
--- a/lib/xray/xray_profile_collector.cc
+++ b/lib/xray/xray_profile_collector.cc
@@ -13,10 +13,11 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
 #include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
 #include <memory>
 #include <pthread.h>
 #include <utility>
@@ -29,7 +30,7 @@
 SpinMutex GlobalMutex;
 struct ThreadTrie {
   tid_t TId;
-  FunctionCallTrie *Trie;
+  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
 };
 
 struct ProfileBuffer {
@@ -56,65 +57,51 @@
   u64 ThreadId;
 };
 
-// These need to be pointers that point to heap/internal-allocator-allocated
-// objects because these are accessed even at program exit.
-Vector<ThreadTrie> *ThreadTries = nullptr;
-Vector<ProfileBuffer> *ProfileBuffers = nullptr;
-FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+using ThreadTriesArray = Array<ThreadTrie>;
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
+    AllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
+    ThreadTriesStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+    ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
+    ThreadTriesArrayAllocatorStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+    ProfileBufferArrayAllocatorStorage;
+
+static ThreadTriesArray *ThreadTries = nullptr;
+static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
 
 } // namespace
 
-void post(const FunctionCallTrie &T, tid_t TId) {
+void post(const FunctionCallTrie &T, tid_t TId) XRAY_NEVER_INSTRUMENT {
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  pthread_once(&Once, +[] {
-    SpinMutexLock Lock(&GlobalMutex);
-    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
-    new (GlobalAllocators) FunctionCallTrie::Allocators();
-    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
-        profilingFlags()->global_allocator_max);
-    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-        InternalAlloc(sizeof(Vector<ThreadTrie>)));
-    new (ThreadTries) Vector<ThreadTrie>();
-    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-    new (ProfileBuffers) Vector<ProfileBuffer>();
-  });
-  DCHECK_NE(GlobalAllocators, nullptr);
-  DCHECK_NE(ThreadTries, nullptr);
-  DCHECK_NE(ProfileBuffers, nullptr);
+  pthread_once(&Once, +[] { reset(); });
 
   ThreadTrie *Item = nullptr;
   {
     SpinMutexLock Lock(&GlobalMutex);
-    if (GlobalAllocators == nullptr)
+    if (GlobalAllocators == nullptr || ThreadTries == nullptr)
       return;
 
-    Item = ThreadTries->PushBack();
+    Item = ThreadTries->Append({});
     Item->TId = TId;
-
-    // Here we're using the internal allocator instead of the managed allocator
-    // because:
-    //
-    // 1) We're not using the segmented array data structure to host
-    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
-    //    which works like a std::vector<...> keeping elements contiguous in
-    //    memory. The segmented array data structure assumes that elements are
-    //    trivially destructible, where FunctionCallTrie isn't.
-    //
-    // 2) Using a managed allocator means we need to manage that separately,
-    //    which complicates the nature of this code. To get around that, we're
-    //    using the internal allocator instead, which has its own global state
-    //    and is decoupled from the lifetime management required by the managed
-    //    allocator we have in XRay.
-    //
-    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
-        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
-    DCHECK_NE(Item->Trie, nullptr);
-    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+    auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+    new (Trie) FunctionCallTrie(*GlobalAllocators);
   }
 
-  T.deepCopyInto(*Item->Trie);
+  auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+  T.deepCopyInto(*Trie);
 }
 
 // A PathArray represents the function id's representing a stack trace. In this
@@ -127,17 +114,13 @@
 
   // The Path in this record is the function id's from the leaf to the root of
   // the function call stack as represented from a FunctionCallTrie.
-  PathArray *Path = nullptr;
+  PathArray Path;
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
-  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
-      : Path([&] {
-          auto P =
-              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A);
-          return P;
-        }()),
+  ProfileRecord(PathAllocator &A,
+                const FunctionCallTrie::Node *N) XRAY_NEVER_INSTRUMENT
+      : Path(A),
         Node(N) {}
 };
 
@@ -147,9 +130,9 @@
 
 // Walk a depth-first traversal of each root of the FunctionCallTrie to generate
 // the path(s) and the data associated with the path.
-static void populateRecords(ProfileRecordArray &PRs,
-                            ProfileRecord::PathAllocator &PA,
-                            const FunctionCallTrie &Trie) {
+static void
+populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
+                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
   using StackArray = Array<const FunctionCallTrie::Node *>;
   using StackAllocator = typename StackArray::AllocatorType;
   StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
@@ -167,8 +150,8 @@
       // Traverse the Node's parents and as we're doing so, get the FIds in
       // the order they appear.
       for (auto N = Node; N != nullptr; N = N->Parent)
-        Record->Path->Append(N->FId);
-      DCHECK(!Record->Path->empty());
+        Record->Path.Append(N->FId);
+      DCHECK(!Record->Path.empty());
 
       for (const auto C : Node->Callees)
         DFSStack.Append(C.NodePtr);
@@ -177,52 +160,58 @@
 }
 
 static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
-                             const ProfileRecordArray &ProfileRecords) {
-  auto NextPtr = static_cast<char *>(
+                             const ProfileRecordArray &ProfileRecords)
+    XRAY_NEVER_INSTRUMENT {
+  auto NextPtr = static_cast<uint8_t *>(
                      internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                  sizeof(Header);
   for (const auto &Record : ProfileRecords) {
     // List of IDs follow:
-    for (const auto FId : *Record.Path)
+    for (const auto FId : Record.Path)
       NextPtr =
-          static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
           sizeof(FId);
 
     // Add the sentinel here.
     constexpr int32_t SentinelFId = 0;
-    NextPtr = static_cast<char *>(
+    NextPtr = static_cast<uint8_t *>(
                   internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
               sizeof(SentinelFId);
 
     // Add the node data here.
     NextPtr =
-        static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
-                                            sizeof(Record.Node->CallCount))) +
+        static_cast<uint8_t *>(internal_memcpy(
+            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
         sizeof(Record.Node->CallCount);
-    NextPtr = static_cast<char *>(
+    NextPtr = static_cast<uint8_t *>(
                   internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                   sizeof(Record.Node->CumulativeLocalTime))) +
               sizeof(Record.Node->CumulativeLocalTime);
   }
 
-  DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
+  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
 }
 
 } // namespace
 
-void serialize() {
+void serialize() XRAY_NEVER_INSTRUMENT {
   SpinMutexLock Lock(&GlobalMutex);
 
-  // Clear out the global ProfileBuffers.
-  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-    InternalFree((*ProfileBuffers)[I].Data);
-  ProfileBuffers->Reset();
+  if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
+      ProfileBuffers == nullptr)
+    return;
 
-  if (ThreadTries->Size() == 0)
+  // Clear out the global ProfileBuffers, if it's not empty.
+  for (auto &B : *ProfileBuffers)
+    deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
+  ProfileBuffers->trim(ProfileBuffers->size());
+
+  if (ThreadTries->empty())
     return;
 
   // Then repopulate the global ProfileBuffers.
-  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+  u32 I = 0;
+  for (const auto &ThreadTrie : *ThreadTries) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
     ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
@@ -233,9 +222,11 @@
     // use a local allocator and an __xray::Array<...> to store the intermediary
     // data, then compute the size as we're going along. Then we'll allocate the
     // contiguous space to contain the thread buffer data.
-    const auto &Trie = *(*ThreadTries)[I].Trie;
+    const auto &Trie =
+        *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
     if (Trie.getRoots().empty())
       continue;
+
     populateRecords(ProfileRecords, PathAlloc, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
@@ -251,68 +242,71 @@
     //   + end of record (8 bytes)
     u32 CumulativeSizes = 0;
     for (const auto &Record : ProfileRecords)
-      CumulativeSizes += 20 + (4 * Record.Path->size());
+      CumulativeSizes += 20 + (4 * Record.Path.size());
 
-    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
-    auto Buffer = ProfileBuffers->PushBack();
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto Buffer = ProfileBuffers->Append({});
     Buffer->Size = sizeof(Header) + CumulativeSizes;
-    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    Buffer->Data = allocateBuffer(Buffer->Size);
     DCHECK_NE(Buffer->Data, nullptr);
     serializeRecords(Buffer, Header, ProfileRecords);
-
-    // Now clean up the ProfileRecords array, one at a time.
-    for (auto &Record : ProfileRecords) {
-      Record.Path->~PathArray();
-      InternalFree(Record.Path);
-    }
   }
 }
 
-void reset() {
+void reset() XRAY_NEVER_INSTRUMENT {
   SpinMutexLock Lock(&GlobalMutex);
+
   if (ProfileBuffers != nullptr) {
     // Clear out the profile buffers that have been serialized.
-    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-      InternalFree((*ProfileBuffers)[I].Data);
-    ProfileBuffers->Reset();
-    InternalFree(ProfileBuffers);
-    ProfileBuffers = nullptr;
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
   }
 
   if (ThreadTries != nullptr) {
     // Clear out the function call tries per thread.
-    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
-      auto &T = (*ThreadTries)[I];
-      T.Trie->~FunctionCallTrie();
-      InternalFree(T.Trie);
+    for (auto &T : *ThreadTries) {
+      auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
+      Trie->~FunctionCallTrie();
     }
-    ThreadTries->Reset();
-    InternalFree(ThreadTries);
-    ThreadTries = nullptr;
+    ThreadTries->trim(ThreadTries->size());
   }
 
   // Reset the global allocators.
-  if (GlobalAllocators != nullptr) {
+  if (GlobalAllocators != nullptr)
     GlobalAllocators->~Allocators();
-    InternalFree(GlobalAllocators);
-    GlobalAllocators = nullptr;
-  }
-  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+
+  GlobalAllocators =
+      reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
   new (GlobalAllocators) FunctionCallTrie::Allocators();
   *GlobalAllocators = FunctionCallTrie::InitAllocators();
-  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-      InternalAlloc(sizeof(Vector<ThreadTrie>)));
-  new (ThreadTries) Vector<ThreadTrie>();
-  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-  new (ProfileBuffers) Vector<ProfileBuffer>();
+
+  if (ThreadTriesAllocator != nullptr)
+    ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+
+  ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
+      &ThreadTriesArrayAllocatorStorage);
+  new (ThreadTriesAllocator)
+      ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
+  ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
+  new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+
+  if (ProfileBuffersAllocator != nullptr)
+    ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+
+  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+      &ProfileBufferArrayAllocatorStorage);
+  new (ProfileBuffersAllocator)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+  ProfileBuffers =
+      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+  new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
 }
 
-XRayBuffer nextBuffer(XRayBuffer B) {
+XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
   SpinMutexLock Lock(&GlobalMutex);
 
-  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
     return {nullptr, 0};
 
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
@@ -336,7 +330,7 @@
   BlockHeader Header;
   internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
   auto NextBlock = Header.BlockNum + 1;
-  if (NextBlock < ProfileBuffers->Size())
+  if (NextBlock < ProfileBuffers->size())
     return {(*ProfileBuffers)[NextBlock].Data,
             (*ProfileBuffers)[NextBlock].Size};
   return {nullptr, 0};
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
index d4b4345..6615de1 100644
--- a/lib/xray/xray_profiling.cc
+++ b/lib/xray/xray_profiling.cc
@@ -19,7 +19,6 @@
 #include "sanitizer_common/sanitizer_flags.h"
 #include "xray/xray_interface.h"
 #include "xray/xray_log_interface.h"
-
 #include "xray_flags.h"
 #include "xray_profile_collector.h"
 #include "xray_profiling_flags.h"
@@ -40,16 +39,30 @@
 SpinMutex ProfilerOptionsMutex;
 
 struct alignas(64) ProfilingData {
-  FunctionCallTrie::Allocators *Allocators = nullptr;
-  FunctionCallTrie *FCT = nullptr;
+  FunctionCallTrie::Allocators *Allocators;
+  FunctionCallTrie *FCT;
 };
 
 static pthread_key_t ProfilingKey;
 
+thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
+    AllocatorsStorage;
+thread_local std::aligned_storage<sizeof(FunctionCallTrie)>::type
+    FunctionCallTrieStorage;
 thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{};
+
 static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
   thread_local auto ThreadOnce = [] {
     new (&ThreadStorage) ProfilingData{};
+    auto *Allocators =
+        reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage);
+    new (Allocators) FunctionCallTrie::Allocators();
+    *Allocators = FunctionCallTrie::InitAllocators();
+    auto *FCT = reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage);
+    new (FCT) FunctionCallTrie(*Allocators);
+    auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+    TLD.Allocators = Allocators;
+    TLD.FCT = FCT;
     pthread_setspecific(ProfilingKey, &ThreadStorage);
     return false;
   }();
@@ -57,25 +70,18 @@
 
   auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
 
-  // We need to check whether the global flag to finalizing/finalized has been
-  // switched. If it is, then we ought to not actually initialise the data.
-  auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
-  if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
-      Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)
-    return TLD;
-
-  // If we're live, then we re-initialize TLD if the pointers are not null.
-  if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) {
-    TLD.Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
-    new (TLD.Allocators) FunctionCallTrie::Allocators();
-    *TLD.Allocators = FunctionCallTrie::InitAllocators();
-    TLD.FCT = reinterpret_cast<FunctionCallTrie *>(
-        InternalAlloc(sizeof(FunctionCallTrie)));
-    new (TLD.FCT) FunctionCallTrie(*TLD.Allocators);
+  if (UNLIKELY(TLD.Allocators == nullptr || TLD.FCT == nullptr)) {
+    auto *Allocators =
+        reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage);
+    new (Allocators) FunctionCallTrie::Allocators();
+    *Allocators = FunctionCallTrie::InitAllocators();
+    auto *FCT = reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage);
+    new (FCT) FunctionCallTrie(*Allocators);
+    TLD.Allocators = Allocators;
+    TLD.FCT = FCT;
   }
 
-  return TLD;
+  return *reinterpret_cast<ProfilingData *>(&ThreadStorage);
 }
 
 static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
@@ -83,8 +89,6 @@
   if (TLD.Allocators != nullptr && TLD.FCT != nullptr) {
     TLD.FCT->~FunctionCallTrie();
     TLD.Allocators->~Allocators();
-    InternalFree(TLD.FCT);
-    InternalFree(TLD.Allocators);
     TLD.FCT = nullptr;
     TLD.Allocators = nullptr;
   }
@@ -162,11 +166,13 @@
 
 thread_local atomic_uint8_t ReentranceGuard{0};
 
-static void postCurrentThreadFCT(ProfilingData &TLD) {
+static void postCurrentThreadFCT(ProfilingData &TLD) XRAY_NEVER_INSTRUMENT {
   if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
     return;
 
-  profileCollectorService::post(*TLD.FCT, GetTid());
+  if (!TLD.FCT->getRoots().empty())
+    profileCollectorService::post(*TLD.FCT, GetTid());
+
   cleanupTLD();
 }
 
@@ -181,13 +187,14 @@
     return;
 
   auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
-  auto &TLD = getThreadLocalData();
   if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED ||
                Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) {
+    auto &TLD = getThreadLocalData();
     postCurrentThreadFCT(TLD);
     return;
   }
 
+  auto &TLD = getThreadLocalData();
   switch (Entry) {
   case XRayEntryType::ENTRY:
   case XRayEntryType::LOG_ARGS_ENTRY:
@@ -235,15 +242,8 @@
 }
 
 XRayLogInitStatus
-profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
-                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  if (BufferSize != 0 || BufferMax != 0) {
-    if (Verbosity())
-      Report("__xray_log_init() being used, and is unsupported. Use "
-             "__xray_log_init_mode(...) instead. Bailing out.");
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  }
-
+profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax,
+                     void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
   s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
   if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
                                       XRayLogInitStatus::XRAY_LOG_INITIALIZING,
diff --git a/lib/xray/xray_segmented_array.h b/lib/xray/xray_segmented_array.h
index 11dd794..c723c7d 100644
--- a/lib/xray/xray_segmented_array.h
+++ b/lib/xray/xray_segmented_array.h
@@ -88,7 +88,7 @@
   // segments when elements are trimmed off the end.
   SegmentBase *Freelist = &SentinelSegment;
 
-  Segment *NewSegment() {
+  Segment *NewSegment() XRAY_NEVER_INSTRUMENT {
     // We need to handle the case in which enough elements have been trimmed to
     // allow us to re-use segments we've allocated before. For this we look into
     // the Freelist, to see whether we need to actually allocate new blocks or
@@ -111,7 +111,7 @@
     return S;
   }
 
-  Segment *InitHeadAndTail() {
+  Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT {
     DCHECK_EQ(Head, &SentinelSegment);
     DCHECK_EQ(Tail, &SentinelSegment);
     auto Segment = NewSegment();
@@ -123,7 +123,7 @@
     return Segment;
   }
 
-  Segment *AppendNewSegment() {
+  Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT {
     auto S = NewSegment();
     if (S == nullptr)
       return nullptr;
@@ -144,16 +144,18 @@
     size_t Size = 0;
 
   public:
-    Iterator(SegmentBase *IS, size_t Off, size_t S)
-        : S(IS), Offset(Off), Size(S) {}
-    Iterator(const Iterator &) noexcept = default;
-    Iterator() noexcept = default;
-    Iterator(Iterator &&) noexcept = default;
-    Iterator &operator=(const Iterator &) = default;
-    Iterator &operator=(Iterator &&) = default;
-    ~Iterator() = default;
+    Iterator(SegmentBase *IS, size_t Off, size_t S) XRAY_NEVER_INSTRUMENT
+        : S(IS),
+          Offset(Off),
+          Size(S) {}
+    Iterator(const Iterator &) NOEXCEPT XRAY_NEVER_INSTRUMENT = default;
+    Iterator() NOEXCEPT XRAY_NEVER_INSTRUMENT = default;
+    Iterator(Iterator &&) NOEXCEPT XRAY_NEVER_INSTRUMENT = default;
+    Iterator &operator=(const Iterator &) XRAY_NEVER_INSTRUMENT = default;
+    Iterator &operator=(Iterator &&) XRAY_NEVER_INSTRUMENT = default;
+    ~Iterator() XRAY_NEVER_INSTRUMENT = default;
 
-    Iterator &operator++() {
+    Iterator &operator++() XRAY_NEVER_INSTRUMENT {
       if (++Offset % ElementsPerSegment || Offset == Size)
         return *this;
 
@@ -168,7 +170,7 @@
       return *this;
     }
 
-    Iterator &operator--() {
+    Iterator &operator--() XRAY_NEVER_INSTRUMENT {
       DCHECK_NE(S, &SentinelSegment);
       DCHECK_GT(Offset, 0);
 
@@ -181,29 +183,31 @@
       return *this;
     }
 
-    Iterator operator++(int) {
+    Iterator operator++(int) XRAY_NEVER_INSTRUMENT {
       Iterator Copy(*this);
       ++(*this);
       return Copy;
     }
 
-    Iterator operator--(int) {
+    Iterator operator--(int) XRAY_NEVER_INSTRUMENT {
       Iterator Copy(*this);
       --(*this);
       return Copy;
     }
 
     template <class V, class W>
-    friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) {
+    friend bool operator==(const Iterator<V> &L,
+                           const Iterator<W> &R) XRAY_NEVER_INSTRUMENT {
       return L.S == R.S && L.Offset == R.Offset;
     }
 
     template <class V, class W>
-    friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) {
+    friend bool operator!=(const Iterator<V> &L,
+                           const Iterator<W> &R) XRAY_NEVER_INSTRUMENT {
       return !(L == R);
     }
 
-    U &operator*() const {
+    U &operator*() const XRAY_NEVER_INSTRUMENT {
       DCHECK_NE(S, &SentinelSegment);
       auto RelOff = Offset % ElementsPerSegment;
 
@@ -214,11 +218,11 @@
       return *reinterpret_cast<U *>(AlignedOffset);
     }
 
-    U *operator->() const { return &(**this); }
+    U *operator->() const XRAY_NEVER_INSTRUMENT { return &(**this); }
   };
 
 public:
-  explicit Array(AllocatorType &A) : Alloc(&A) {}
+  explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT : Alloc(&A) {}
 
   Array(const Array &) = delete;
   Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
@@ -230,16 +234,16 @@
     O.Size = 0;
   }
 
-  bool empty() const { return Size == 0; }
+  bool empty() const XRAY_NEVER_INSTRUMENT { return Size == 0; }
 
-  AllocatorType &allocator() const {
+  AllocatorType &allocator() const XRAY_NEVER_INSTRUMENT {
     DCHECK_NE(Alloc, nullptr);
     return *Alloc;
   }
 
-  size_t size() const { return Size; }
+  size_t size() const XRAY_NEVER_INSTRUMENT { return Size; }
 
-  T *Append(const T &E) {
+  T *Append(const T &E) XRAY_NEVER_INSTRUMENT {
     if (UNLIKELY(Head == &SentinelSegment))
       if (InitHeadAndTail() == nullptr)
         return nullptr;
@@ -257,7 +261,8 @@
     return Position;
   }
 
-  template <class... Args> T *AppendEmplace(Args &&... args) {
+  template <class... Args>
+  T *AppendEmplace(Args &&... args) XRAY_NEVER_INSTRUMENT {
     if (UNLIKELY(Head == &SentinelSegment))
       if (InitHeadAndTail() == nullptr)
         return nullptr;
@@ -281,7 +286,7 @@
     return reinterpret_cast<T *>(Position);
   }
 
-  T &operator[](size_t Offset) const {
+  T &operator[](size_t Offset) const XRAY_NEVER_INSTRUMENT {
     DCHECK_LE(Offset, Size);
     // We need to traverse the array enough times to find the element at Offset.
     auto S = Head;
@@ -296,13 +301,13 @@
     return *reinterpret_cast<T *>(Position);
   }
 
-  T &front() const {
+  T &front() const XRAY_NEVER_INSTRUMENT {
     DCHECK_NE(Head, &SentinelSegment);
     DCHECK_NE(Size, 0u);
     return *begin();
   }
 
-  T &back() const {
+  T &back() const XRAY_NEVER_INSTRUMENT {
     DCHECK_NE(Tail, &SentinelSegment);
     DCHECK_NE(Size, 0u);
     auto It = end();
@@ -310,7 +315,8 @@
     return *It;
   }
 
-  template <class Predicate> T *find_element(Predicate P) const {
+  template <class Predicate>
+  T *find_element(Predicate P) const XRAY_NEVER_INSTRUMENT {
     if (empty())
       return nullptr;
 
@@ -324,7 +330,10 @@
 
   /// Remove N Elements from the end. This leaves the blocks behind, and not
   /// require allocation of new blocks for new elements added after trimming.
-  void trim(size_t Elements) {
+  void trim(size_t Elements) XRAY_NEVER_INSTRUMENT {
+    if (Elements == 0)
+      return;
+
     DCHECK_LE(Elements, Size);
     DCHECK_GT(Size, 0);
     auto OldSize = Size;
@@ -357,10 +366,18 @@
   }
 
   // Provide iterators.
-  Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); }
-  Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); }
-  Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); }
-  Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); }
+  Iterator<T> begin() const XRAY_NEVER_INSTRUMENT {
+    return Iterator<T>(Head, 0, Size);
+  }
+  Iterator<T> end() const XRAY_NEVER_INSTRUMENT {
+    return Iterator<T>(Tail, Size, Size);
+  }
+  Iterator<const T> cbegin() const XRAY_NEVER_INSTRUMENT {
+    return Iterator<const T>(Head, 0, Size);
+  }
+  Iterator<const T> cend() const XRAY_NEVER_INSTRUMENT {
+    return Iterator<const T>(Tail, Size, Size);
+  }
 };
 
 // We need to have this storage definition out-of-line so that the compiler can
diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S
index 99ad396..9dffae0 100644
--- a/lib/xray/xray_trampoline_x86_64.S
+++ b/lib/xray/xray_trampoline_x86_64.S
@@ -19,6 +19,7 @@
 
 
 .macro SAVE_REGISTERS
+	pushfq
 	subq $240, %rsp
 	CFI_DEF_CFA_OFFSET(248)
 	movq %rbp, 232(%rsp)
@@ -69,6 +70,7 @@
 	movq  8(%rsp), %r14
 	movq  0(%rsp), %r15
 	addq	$240, %rsp
+	popfq
 	CFI_DEF_CFA_OFFSET(8)
 .endm
 
diff --git a/lib/xray/xray_utils.cc b/lib/xray/xray_utils.cc
index 68f4e8c..5753354 100644
--- a/lib/xray/xray_utils.cc
+++ b/lib/xray/xray_utils.cc
@@ -103,10 +103,9 @@
   if (LastSlash != nullptr)
     Progname = LastSlash + 1;
 
-  const int HalfLength = sizeof(TmpFilename) / 2 - sizeof(TmpWildcardPattern);
   int NeededLength = internal_snprintf(
-      TmpFilename, sizeof(TmpFilename), "%.*s%.*s.%s", HalfLength,
-      flags()->xray_logfile_base, HalfLength, Progname, TmpWildcardPattern);
+      TmpFilename, sizeof(TmpFilename), "%s%s.%s",
+      flags()->xray_logfile_base, Progname, TmpWildcardPattern);
   if (NeededLength > int(sizeof(TmpFilename))) {
     Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename);
     return -1;
diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc
index 51dc4ce..d0411d0 100644
--- a/lib/xray/xray_x86_64.cc
+++ b/lib/xray/xray_x86_64.cc
@@ -1,9 +1,10 @@
 #include "cpuid.h"
 #include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_posix.h"
 #include "xray_defs.h"
 #include "xray_interface_internal.h"
 
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
 #include <sys/types.h>
 #if SANITIZER_OPENBSD
 #include <sys/time.h>
@@ -81,17 +82,20 @@
   }
   return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency);
 }
-#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
 uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
     long long TSCFrequency = -1;
     size_t tscfreqsz = sizeof(TSCFrequency);
 #if SANITIZER_OPENBSD
     int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ };
-    if (sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) {
+    if (internal_sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) {
+#elif SANITIZER_MAC
+    if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency,
+                              &tscfreqsz, NULL, 0) != -1) {
 
 #else
-    if (sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
-        NULL, 0) != -1) {
+    if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
+                              NULL, 0) != -1) {
 #endif
         return static_cast<uint64_t>(TSCFrequency);
     } else {
diff --git a/test/asan/TestCases/Darwin/segv_read_write.c b/test/asan/TestCases/Darwin/segv_read_write.c
index d8e2d21..127365d 100644
--- a/test/asan/TestCases/Darwin/segv_read_write.c
+++ b/test/asan/TestCases/Darwin/segv_read_write.c
@@ -9,11 +9,8 @@
 __attribute__((noinline)) void Read(int *ptr) { sink = *ptr; }
 __attribute__((noinline)) void Write(int *ptr) { *ptr = 0; }
 int main(int argc, char **argv) {
-  // Writes to shadow are detected as reads from shadow gap (because of how the
-  // shadow mapping works). This is kinda hard to fix. Test a random address in
-  // the application part of the address space.
   void *volatile p =
-      mmap(nullptr, 4096, PROT_READ, MAP_PRIVATE | MAP_ANON, 0, 0);
+      mmap(nullptr, 4096, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
   munmap(p, 4096);
   if (argc == 1)
     Read((int *)p);
diff --git a/test/asan/TestCases/Linux/coverage-missing.cc b/test/asan/TestCases/Linux/coverage-missing.cc
index 32aada6..10acef9 100644
--- a/test/asan/TestCases/Linux/coverage-missing.cc
+++ b/test/asan/TestCases/Linux/coverage-missing.cc
@@ -45,7 +45,15 @@
 // RUN: diff bar.txt foo-missing.txt > %t.log || true
 // RUN: not grep "^<" %t.log
 
-// REQUIRES: x86-target-arch
+// FIXME %sancov GetInstrumentedPCs relies on objdump -d to
+// obtain the number of instrumented PCs.  The i386
+// %dynamiclib has .plt entries that are not recognized by
+// objdump,
+// "sancov.py: found 0 instrumented PCs in *.so",
+// causing AddressSanitizer-i386-linux to fail.
+// Change it back to x86-target-arch after %sancov switches to a more robust approach.
+
+// REQUIRES: x86_64-target-arch
 // XFAIL: android
 
 #include <stdio.h>
diff --git a/test/asan/TestCases/Posix/stack-use-after-return.cc b/test/asan/TestCases/Posix/stack-use-after-return.cc
index 237c880..528fa94 100644
--- a/test/asan/TestCases/Posix/stack-use-after-return.cc
+++ b/test/asan/TestCases/Posix/stack-use-after-return.cc
@@ -78,9 +78,11 @@
   pthread_attr_init(&attr);
   if (kStackSize > 0) {
     size_t desired_stack_size = kStackSize;
+#ifdef PTHREAD_STACK_MIN
     if (desired_stack_size < PTHREAD_STACK_MIN) {
       desired_stack_size = PTHREAD_STACK_MIN;
     }
+#endif
 
     int ret = pthread_attr_setstacksize(&attr, desired_stack_size);
     if (ret != 0) {
diff --git a/test/asan/TestCases/intercept-rethrow-exception.cc b/test/asan/TestCases/intercept-rethrow-exception.cc
index e81dc53..019092a 100644
--- a/test/asan/TestCases/intercept-rethrow-exception.cc
+++ b/test/asan/TestCases/intercept-rethrow-exception.cc
@@ -4,6 +4,10 @@
 // RUN: %clangxx_asan -fexceptions -O0 %s -o %t
 // RUN: %run %t
 
+// The current implementation of this functionality requires special
+// combination of libraries that are not used by default on NetBSD
+// XFAIL: netbsd
+
 #include <assert.h>
 #include <exception>
 #include <sanitizer/asan_interface.h>
diff --git a/test/asan/TestCases/throw_catch.cc b/test/asan/TestCases/throw_catch.cc
index 0108351..d7f00ea 100644
--- a/test/asan/TestCases/throw_catch.cc
+++ b/test/asan/TestCases/throw_catch.cc
@@ -21,6 +21,7 @@
   }
 }
 
+__attribute__((noinline))
 void TestThrow() {
   char x[32];
   fprintf(stderr, "Before: %p poisoned: %d\n", &x,
@@ -36,6 +37,7 @@
     assert(!__asan_address_is_poisoned(x + 32));
 }
 
+__attribute__((noinline))
 void TestThrowInline() {
   char x[32];
   fprintf(stderr, "Before: %p poisoned: %d\n", &x,
diff --git a/test/builtins/Unit/compiler_rt_logb_test.c b/test/builtins/Unit/compiler_rt_logb_test.c
new file mode 100644
index 0000000..7967659
--- /dev/null
+++ b/test/builtins/Unit/compiler_rt_logb_test.c
@@ -0,0 +1,63 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+//===-- compiler_rt_logb_test.c - Test __compiler_rt_logb -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file checks __compiler_rt_logb from the compiler_rt library for
+// conformance against libm.
+//
+//===----------------------------------------------------------------------===//
+
+#define DOUBLE_PRECISION
+#include <math.h>
+#include <stdio.h>
+#include "fp_lib.h"
+
+int test__compiler_rt_logb(fp_t x) {
+  fp_t crt_value = __compiler_rt_logb(x);
+  fp_t libm_value = logb(x);
+  // Compare actual rep, e.g. to avoid NaN != the same NaN
+  if (toRep(crt_value) != toRep(libm_value)) {
+    printf("error: in __compiler_rt_logb(%a [%lX]) = %a [%lX] !=  %a [%lX]\n",
+           x, toRep(x), crt_value, toRep(crt_value), libm_value,
+           toRep(libm_value));
+    return 1;
+  }
+  return 0;
+}
+
+double cases[] = {
+    1.e-6, -1.e-6, NAN, -NAN, INFINITY, -INFINITY, -1,
+    -0.0,  0.0,    1,   -2,   2,        -0.5,      0.5,
+};
+
+int main() {
+  const unsigned N = sizeof(cases) / sizeof(cases[0]);
+  unsigned i;
+  for (i = 0; i < N; ++i) {
+    if (test__compiler_rt_logb(cases[i])) return 1;
+  }
+
+  // Test a moving 1 bit, especially to handle denormal values.
+  // Test the negation as well.
+  rep_t x = signBit;
+  while (x) {
+    if (test__compiler_rt_logb(fromRep(x))) return 1;
+    if (test__compiler_rt_logb(fromRep(signBit ^ x))) return 1;
+    x >>= 1;
+  }
+  // Also try a couple moving ones
+  x = signBit | (signBit >> 1) | (signBit >> 2);
+  while (x) {
+    if (test__compiler_rt_logb(fromRep(x))) return 1;
+    if (test__compiler_rt_logb(fromRep(signBit ^ x))) return 1;
+    x >>= 1;
+  }
+
+  return 0;
+}
diff --git a/test/builtins/Unit/compiler_rt_logbf_test.c b/test/builtins/Unit/compiler_rt_logbf_test.c
new file mode 100644
index 0000000..5a4979a
--- /dev/null
+++ b/test/builtins/Unit/compiler_rt_logbf_test.c
@@ -0,0 +1,63 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+//===-- compiler_rt_logbf_test.c - Test __compiler_rt_logbf ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file checks __compiler_rt_logbf from the compiler_rt library for
+// conformance against libm.
+//
+//===----------------------------------------------------------------------===//
+
+#define SINGLE_PRECISION
+#include <math.h>
+#include <stdio.h>
+#include "fp_lib.h"
+
+int test__compiler_rt_logbf(fp_t x) {
+  fp_t crt_value = __compiler_rt_logbf(x);
+  fp_t libm_value = logbf(x);
+  // Compare actual rep, e.g. to avoid NaN != the same NaN
+  if (toRep(crt_value) != toRep(libm_value)) {
+    printf("error: in __compiler_rt_logb(%a [%X]) = %a [%X] !=  %a [%X]\n", x,
+           toRep(x), crt_value, toRep(crt_value), libm_value,
+           toRep(libm_value));
+    return 1;
+  }
+  return 0;
+}
+
+double cases[] = {
+    1.e-6, -1.e-6, NAN, -NAN, INFINITY, -INFINITY, -1,
+    -0.0,  0.0,    1,   -2,   2,        -0.5,      0.5,
+};
+
+int main() {
+  const unsigned N = sizeof(cases) / sizeof(cases[0]);
+  unsigned i;
+  for (i = 0; i < N; ++i) {
+    if (test__compiler_rt_logbf(cases[i])) return 1;
+  }
+
+  // Test a moving 1 bit, especially to handle denormal values.
+  // Test the negation as well.
+  rep_t x = signBit;
+  while (x) {
+    if (test__compiler_rt_logbf(fromRep(x))) return 1;
+    if (test__compiler_rt_logbf(fromRep(signBit ^ x))) return 1;
+    x >>= 1;
+  }
+  // Also try a couple moving ones
+  x = signBit | (signBit >> 1) | (signBit >> 2);
+  while (x) {
+    if (test__compiler_rt_logbf(fromRep(x))) return 1;
+    if (test__compiler_rt_logbf(fromRep(signBit ^ x))) return 1;
+    x >>= 1;
+  }
+
+  return 0;
+}
diff --git a/test/builtins/Unit/compiler_rt_logbl_test.c b/test/builtins/Unit/compiler_rt_logbl_test.c
new file mode 100644
index 0000000..2617748
--- /dev/null
+++ b/test/builtins/Unit/compiler_rt_logbl_test.c
@@ -0,0 +1,79 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+//===-- compiler_rt_logbl_test.c - Test __compiler_rt_logbl ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file checks __compiler_rt_logbl from the compiler_rt library for
+// conformance against libm.
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include <math.h>
+#include <stdio.h>
+#include "fp_lib.h"
+#include "int_lib.h"
+
+#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT)
+
+int test__compiler_rt_logbl(fp_t x) {
+  fp_t crt_value = __compiler_rt_logbl(x);
+  fp_t libm_value = logbl(x);
+  // Compare actual rep, e.g. to avoid NaN != the same NaN
+  if (toRep(crt_value) != toRep(libm_value)) {
+    // Split expected values into two for printf
+    twords x_t, crt_value_t, libm_value_t;
+    x_t.all = toRep(x);
+    crt_value_t.all = toRep(crt_value);
+    libm_value_t.all = toRep(libm_value);
+    printf(
+        "error: in __compiler_rt_logb(%a [%llX %llX]) = %a [%llX %llX] !=  %a "
+        "[%llX %llX]\n",
+        x, x_t.s.high, x_t.s.low, crt_value, crt_value_t.s.high,
+        crt_value_t.s.low, libm_value, libm_value_t.s.high, libm_value_t.s.low);
+    return 1;
+  }
+  return 0;
+}
+
+double cases[] = {
+    1.e-6, -1.e-6, NAN, -NAN, INFINITY, -INFINITY, -1,
+    -0.0,  0.0,    1,   -2,   2,        -0.5,      0.5,
+};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT)
+  const unsigned N = sizeof(cases) / sizeof(cases[0]);
+  unsigned i;
+  for (i = 0; i < N; ++i) {
+    if (test__compiler_rt_logbl(cases[i])) return 1;
+  }
+
+  // Test a moving 1 bit, especially to handle denormal values.
+  // Test the negation as well.
+  rep_t x = signBit;
+  while (x) {
+    if (test__compiler_rt_logbl(fromRep(x))) return 1;
+    if (test__compiler_rt_logbl(fromRep(signBit ^ x))) return 1;
+    x >>= 1;
+  }
+  // Also try a couple moving ones
+  x = signBit | (signBit >> 1) | (signBit >> 2);
+  while (x) {
+    if (test__compiler_rt_logbl(fromRep(x))) return 1;
+    if (test__compiler_rt_logbl(fromRep(signBit ^ x))) return 1;
+    x >>= 1;
+  }
+#else
+  printf("skipped\n");
+#endif
+
+  return 0;
+}
diff --git a/test/cfi/cross-dso/lit.local.cfg b/test/cfi/cross-dso/lit.local.cfg
index afdac42..245d434 100644
--- a/test/cfi/cross-dso/lit.local.cfg
+++ b/test/cfi/cross-dso/lit.local.cfg
@@ -5,7 +5,7 @@
 
 root = getRoot(config)
 
-if root.host_os not in ['Linux']:
+if root.host_os not in ['Linux', 'FreeBSD', 'NetBSD']:
   config.unsupported = True
 
 # Android O (API level 26) has support for cross-dso cfi in libdl.so.
diff --git a/test/esan/TestCases/large-stack-linux.c b/test/esan/TestCases/large-stack-linux.c
index 1af32f8..17d8867 100644
--- a/test/esan/TestCases/large-stack-linux.c
+++ b/test/esan/TestCases/large-stack-linux.c
@@ -1,5 +1,7 @@
 // RUN: %clang_esan_wset -O0 %s -o %t 2>&1
 // RUN: %env_esan_opts="verbosity=1 record_snapshots=0" %run %t %t 2>&1 | FileCheck %s
+// Stucks at init and no clone feature equivalent.
+// UNSUPPORTED: freebsd
 
 #include <assert.h>
 #include <stdio.h>
diff --git a/test/esan/TestCases/workingset-early-fault.c b/test/esan/TestCases/workingset-early-fault.c
index 1c420c3..971285b 100644
--- a/test/esan/TestCases/workingset-early-fault.c
+++ b/test/esan/TestCases/workingset-early-fault.c
@@ -3,6 +3,8 @@
 //
 // RUN: %clang_esan_wset %s -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
+// Stucks at init and no clone feature equivalent.
+// UNSUPPORTED: freebsd
 
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/test/esan/TestCases/workingset-memset.cpp b/test/esan/TestCases/workingset-memset.cpp
index 9c972ec..56ed2f5 100644
--- a/test/esan/TestCases/workingset-memset.cpp
+++ b/test/esan/TestCases/workingset-memset.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_esan_wset -O0 %s -o %t 2>&1
 // RUN: %run %t 2>&1 | FileCheck %s
+// Stucks at init and no clone feature equivalent.
+// UNSUPPORTED: freebsd
 
 #include <stdlib.h>
 #include <string.h>
diff --git a/test/esan/TestCases/workingset-midreport.cpp b/test/esan/TestCases/workingset-midreport.cpp
index 38c3765..acd1eed 100644
--- a/test/esan/TestCases/workingset-midreport.cpp
+++ b/test/esan/TestCases/workingset-midreport.cpp
@@ -6,6 +6,8 @@
 
 // FIXME: Re-enable once PR33590 is fixed.
 // UNSUPPORTED: x86_64
+// Stucks at init and no clone feature equivalent.
+// UNSUPPORTED: freebsd
 
 #include <sanitizer/esan_interface.h>
 #include <sched.h>
diff --git a/test/esan/TestCases/workingset-samples.cpp b/test/esan/TestCases/workingset-samples.cpp
index d97b62b..1f8e97d 100644
--- a/test/esan/TestCases/workingset-samples.cpp
+++ b/test/esan/TestCases/workingset-samples.cpp
@@ -3,6 +3,8 @@
 
 // FIXME: Re-enable once PR33590 is fixed.
 // UNSUPPORTED: x86_64
+// Stucks at init and no clone feature equivalent.
+// UNSUPPORTED: freebsd
 
 #include <sanitizer/esan_interface.h>
 #include <sched.h>
diff --git a/test/esan/TestCases/workingset-signal-posix.cpp b/test/esan/TestCases/workingset-signal-posix.cpp
index ba776fc..6f9787b 100644
--- a/test/esan/TestCases/workingset-signal-posix.cpp
+++ b/test/esan/TestCases/workingset-signal-posix.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_esan_wset -O0 %s -o %t 2>&1
 // RUN: %run %t 2>&1 | FileCheck %s
+// Stucks at init and no clone feature equivalent.
+// UNSUPPORTED: freebsd
 
 #include <assert.h>
 #include <setjmp.h>
diff --git a/test/esan/TestCases/workingset-simple.cpp b/test/esan/TestCases/workingset-simple.cpp
index f1ac2ec..dc17bcf 100644
--- a/test/esan/TestCases/workingset-simple.cpp
+++ b/test/esan/TestCases/workingset-simple.cpp
@@ -3,6 +3,8 @@
 
 // FIXME: Re-enable once PR33590 is fixed.
 // UNSUPPORTED: x86_64
+// Stucks at init and no clone feature equivalent.
+// UNSUPPORTED: freebsd
 
 #include <stdlib.h>
 #include <string.h>
diff --git a/test/esan/lit.cfg b/test/esan/lit.cfg
index 8b8457d..1bb34ee 100644
--- a/test/esan/lit.cfg
+++ b/test/esan/lit.cfg
@@ -39,6 +39,5 @@
 # Default test suffixes.
 config.suffixes = ['.c', '.cpp']
 
-# EfficiencySanitizer tests are currently supported on Linux x86-64 only.
-if config.host_os not in ['Linux'] or config.target_arch not in ['x86_64', 'mips64'] :
+if config.host_os not in ['Linux', 'FreeBSD'] or config.target_arch not in ['x86_64', 'mips64'] :
   config.unsupported = True
diff --git a/test/fuzzer/InitializeTest.cpp b/test/fuzzer/InitializeTest.cpp
index a93c2a5..5022c9e 100644
--- a/test/fuzzer/InitializeTest.cpp
+++ b/test/fuzzer/InitializeTest.cpp
@@ -9,7 +9,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-static char *argv0;
+static char *argv0 = NULL;
 
 extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
   assert(*argc > 0);
@@ -20,8 +20,7 @@
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   assert(argv0);
-  if (Size == strlen(argv0) &&
-      !memmem(Data, Size, argv0, Size)) {
+  if (argv0 && Size >= 4 && !memcmp(Data, "fuzz", 4)) {
     fprintf(stderr, "BINGO %s\n", argv0);
     exit(1);
   }
diff --git a/test/fuzzer/SymbolizeDeadlock.cpp b/test/fuzzer/SymbolizeDeadlock.cpp
index 5be1be8..b9ece38 100644
--- a/test/fuzzer/SymbolizeDeadlock.cpp
+++ b/test/fuzzer/SymbolizeDeadlock.cpp
@@ -8,7 +8,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#include <unistd.h>
 
 #include "Bingo.h"
 
diff --git a/test/fuzzer/afl-driver-extra-stats.test b/test/fuzzer/afl-driver-extra-stats.test
index cddb683..2f5641d 100644
--- a/test/fuzzer/afl-driver-extra-stats.test
+++ b/test/fuzzer/afl-driver-extra-stats.test
@@ -1,3 +1,5 @@
+# AFL doesn't work on Windows. No reason to test the driver.
+UNSUPPORTED: windows
 XFAIL: ios
 RUN: %no_fuzzer_cpp_compiler %S/AFLDriverTest.cpp %libfuzzer_src/afl/afl_driver.cpp -o %t-AFLDriverTest
 
diff --git a/test/fuzzer/afl-driver-stderr.test b/test/fuzzer/afl-driver-stderr.test
index d3d739d..5e3007e 100644
--- a/test/fuzzer/afl-driver-stderr.test
+++ b/test/fuzzer/afl-driver-stderr.test
@@ -1,5 +1,6 @@
+# AFL doesn't work on Windows. No reason to test the driver.
+UNSUPPORTED: freebsd, windows
 XFAIL: ios
-UNSUPPORTED: freebsd
 RUN: %no_fuzzer_cpp_compiler %S/AFLDriverTest.cpp %libfuzzer_src/afl/afl_driver.cpp -o %t-AFLDriverTest
 
 ; Test that not specifying a stderr file isn't broken.
diff --git a/test/fuzzer/coverage.test b/test/fuzzer/coverage.test
index 3b2341f..ff7a436 100644
--- a/test/fuzzer/coverage.test
+++ b/test/fuzzer/coverage.test
@@ -1,4 +1,5 @@
-UNSUPPORTED: aarch64
+# FIXME: Disabled on Windows because -fPIC cannot be used to compile for Windows.
+UNSUPPORTED: windows
 RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable  %S/NullDerefTest.cpp -o %t-NullDerefTest
 RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/DSO1.cpp -fPIC %ld_flags_rpath_so1 -shared -o %dynamiclib1
 RUN: %cpp_compiler -mllvm -use-unknown-locations=Disable %S/DSO2.cpp -fPIC %ld_flags_rpath_so2 -shared -o %dynamiclib2
diff --git a/test/fuzzer/dso.test b/test/fuzzer/dso.test
index fc1fe23..60ef8a6 100644
--- a/test/fuzzer/dso.test
+++ b/test/fuzzer/dso.test
@@ -1,3 +1,5 @@
+# FIXME: Disabled on Windows because -fPIC cannot be used to compile for Windows.
+UNSUPPORTED: windows
 RUN: %cpp_compiler %S/DSO1.cpp -fPIC %ld_flags_rpath_so1 -shared -o %dynamiclib1
 RUN: %cpp_compiler %S/DSO2.cpp -fPIC %ld_flags_rpath_so2 -shared -o %dynamiclib2
 RUN: %cpp_compiler %S/DSOTestMain.cpp %S/DSOTestExtra.cpp %ld_flags_rpath_exe1 %ld_flags_rpath_exe2 -o %t-DSOTest
diff --git a/test/fuzzer/dump_coverage.test b/test/fuzzer/dump_coverage.test
index 41e1938..803a4fb 100644
--- a/test/fuzzer/dump_coverage.test
+++ b/test/fuzzer/dump_coverage.test
@@ -1,4 +1,5 @@
-UNSUPPORTED: freebsd
+# FIXME: Disabled on Windows because -fPIC cannot be used to compile for Windows.
+UNSUPPORTED: freebsd, windows
 RUN: %cpp_compiler -fsanitize-coverage=0 -fsanitize-coverage=trace-pc-guard %S/DSO1.cpp -fPIC -shared -o %dynamiclib1 %ld_flags_rpath_so1
 RUN: %cpp_compiler -fsanitize-coverage=0 -fsanitize-coverage=trace-pc-guard %S/DSO2.cpp -fPIC -shared -o %dynamiclib2 %ld_flags_rpath_so2
 RUN: %cpp_compiler -fsanitize-coverage=0 -fsanitize-coverage=trace-pc-guard %S/DSOTestMain.cpp %S/DSOTestExtra.cpp %ld_flags_rpath_exe1 %ld_flags_rpath_exe2 -o %t-DSOTest
@@ -7,7 +8,7 @@
 
 RUN: rm -rf %t_workdir && mkdir -p %t_workdir
 RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not %run %t-NullDerefTest -dump_coverage=1 2>&1 | FileCheck %s
-RUN: sancov -covered-functions %t-NullDerefTest* %t_workdir/*.sancov | FileCheck %s --check-prefix=SANCOV
+RUN: sancov -covered-functions %t-NullDerefTest %t_workdir/*.sancov | FileCheck %s --check-prefix=SANCOV
 RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' %run %t-DSOTest -dump_coverage=1 -runs=0 2>&1 | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=DSO
 RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not %run %t-NullDerefTest -dump_coverage=0 2>&1 | FileCheck %s --check-prefix=NOCOV
 
diff --git a/test/fuzzer/exit_on_src_pos.test b/test/fuzzer/exit_on_src_pos.test
index ad0fa0a..c08c014 100644
--- a/test/fuzzer/exit_on_src_pos.test
+++ b/test/fuzzer/exit_on_src_pos.test
@@ -1,9 +1,11 @@
 # Temporary use -mllvm -use-unknown-locations=Disable so that
 # all instructions have debug info (file line numbers) attached.
 # TODO: Find out why test fails on Darwin with -O2.
-RUN: %cpp_compiler -O0 %S/SimpleTest.cpp -o %t-SimpleTest -mllvm -use-unknown-locations=Disable
-RUN: %cpp_compiler -O0 %S/ShrinkControlFlowTest.cpp -o %t-ShrinkControlFlowTest
+# Binaries must end in .exe or else symbolization will break on Windows because of how periods
+# in expansion of %t cause the compiler to overwrite .lib and .exp files.
+RUN: %cpp_compiler -O0 %S/SimpleTest.cpp -o %t-SimpleTest.exe -mllvm -use-unknown-locations=Disable
+RUN: %cpp_compiler -O0 %S/ShrinkControlFlowTest.cpp -o %t-ShrinkControlFlowTest.exe
 
-RUN: %run %t-SimpleTest  -exit_on_src_pos=SimpleTest.cpp:18                 2>&1 | FileCheck %s --check-prefix=EXIT_ON_SRC_POS
-RUN: %run %t-ShrinkControlFlowTest  -exit_on_src_pos=Foo 2>&1 | FileCheck %s --check-prefix=EXIT_ON_SRC_POS
+RUN: %run %t-SimpleTest.exe  -exit_on_src_pos=SimpleTest.cpp:18 2>&1 | FileCheck %s --check-prefix=EXIT_ON_SRC_POS
+RUN: %run %t-ShrinkControlFlowTest.exe  -exit_on_src_pos=Foo 2>&1 | FileCheck %s --check-prefix=EXIT_ON_SRC_POS
 EXIT_ON_SRC_POS: INFO: found line matching '{{.*}}', exiting.
diff --git a/test/fuzzer/fuzzer-implicit-integer-truncation.test b/test/fuzzer/fuzzer-implicit-integer-truncation.test
index 212559b..c968ea7 100644
--- a/test/fuzzer/fuzzer-implicit-integer-truncation.test
+++ b/test/fuzzer/fuzzer-implicit-integer-truncation.test
@@ -1,5 +1,5 @@
 RUN: rm -f %t-ImplicitIntegerTruncationTest-Ubsan
 RUN: %cpp_compiler -fsanitize=implicit-integer-truncation -fno-sanitize-recover=all %S/ImplicitIntegerTruncationTest.cpp -o %t-ImplicitIntegerTruncationTest-Ubsan
 RUN: not %run %t-ImplicitIntegerTruncationTest-Ubsan 2>&1 | FileCheck %s
-CHECK: runtime error: implicit conversion from type 'int' of value 256 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned)
+CHECK: ImplicitIntegerTruncationTest.cpp:22:17: runtime error: implicit conversion from type 'int' of value 256 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned)
 CHECK: Test unit written to ./crash-
diff --git a/test/fuzzer/fuzzer-mutationstats.test b/test/fuzzer/fuzzer-mutationstats.test
deleted file mode 100644
index cb75b36..0000000
--- a/test/fuzzer/fuzzer-mutationstats.test
+++ /dev/null
@@ -1,10 +0,0 @@
-RUN: %cpp_compiler %S/SimpleTest.cpp -o %t-MutationStatsTest
-RUN: not %run %t-MutationStatsTest -print_mutation_stats=1 2>&1 | FileCheck %s --check-prefix=STAT
-
-# Ensures there are some non-zero values in the usefulness percentages printed.
-STAT: stat::mutation_usefulness:   {{[0-9]+\.[0-9]+}}
-
-# Weighted mutations only trigger after first 10,000 runs, hence flag.
-RUN: not %run %t-MutationStatsTest -use_weighted_mutations=1 -seed=1 -runs=100000 2>&1 | FileCheck %s --check-prefix=WEIGHTED
-
-WEIGHTED: BINGO
diff --git a/test/fuzzer/fuzzer-oom.test b/test/fuzzer/fuzzer-oom.test
index e82fb47..362ec31 100644
--- a/test/fuzzer/fuzzer-oom.test
+++ b/test/fuzzer/fuzzer-oom.test
@@ -1,17 +1,21 @@
 UNSUPPORTED: aarch64
-RUN: %cpp_compiler %S/OutOfMemoryTest.cpp -o %t-OutOfMemoryTest
-RUN: %cpp_compiler %S/OutOfMemorySingleLargeMallocTest.cpp -o %t-OutOfMemorySingleLargeMallocTest
-RUN: %cpp_compiler %S/AccumulateAllocationsTest.cpp -o %t-AccumulateAllocationsTest
+# Tests break on windows unless exe extension is used (because there are periods
+# in expansion of %t, the string after the period is interpreted as the file
+# extension, so each compilation will clobber the previous one's lib and exp
+# files causing symbolization to break).
+RUN: %cpp_compiler %S/OutOfMemoryTest.cpp -o %t-OutOfMemoryTest.exe
+RUN: %cpp_compiler %S/OutOfMemorySingleLargeMallocTest.cpp -o %t-OutOfMemorySingleLargeMallocTest.exe
+RUN: %cpp_compiler %S/AccumulateAllocationsTest.cpp -o %t-AccumulateAllocationsTest.exe
 
-RUN: not %run %t-OutOfMemoryTest -rss_limit_mb=300 2>&1 | FileCheck %s
+RUN: not %run %t-OutOfMemoryTest.exe -rss_limit_mb=300 2>&1 | FileCheck %s
 
 CHECK: ERROR: libFuzzer: out-of-memory (used: {{.*}}; limit: 300Mb)
 CHECK: Test unit written to ./oom-
 SUMMARY: libFuzzer: out-of-memory
 
-RUN: not %run %t-OutOfMemorySingleLargeMallocTest -rss_limit_mb=300    2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
-RUN: not %run %t-OutOfMemorySingleLargeMallocTest -malloc_limit_mb=300 2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
-RUN: not %run %t-OutOfMemorySingleLargeMallocTest -rss_limit_mb=1000 -malloc_limit_mb=300 2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
+RUN: not %run %t-OutOfMemorySingleLargeMallocTest.exe -rss_limit_mb=300    2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
+RUN: not %run %t-OutOfMemorySingleLargeMallocTest.exe -malloc_limit_mb=300 2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
+RUN: not %run %t-OutOfMemorySingleLargeMallocTest.exe -rss_limit_mb=1000 -malloc_limit_mb=300 2>&1 | FileCheck %s --check-prefix=SINGLE_LARGE_MALLOC
 
 We used to check for "out-of-memory (malloc(53{{.*}}))", but that would fail
 sometimes, so now we accept any OOM message.
@@ -20,4 +24,4 @@
 SINGLE_LARGE_MALLOC: in LLVMFuzzerTestOneInput
 
 # Check that -rss_limit_mb=0 means no limit.
-RUN: %run %t-AccumulateAllocationsTest -runs=1000 -rss_limit_mb=0
+RUN: %run %t-AccumulateAllocationsTest.exe -runs=1000 -rss_limit_mb=0
diff --git a/test/fuzzer/gc-sections.test b/test/fuzzer/gc-sections.test
index b8abfbb..e915c4c 100644
--- a/test/fuzzer/gc-sections.test
+++ b/test/fuzzer/gc-sections.test
@@ -8,8 +8,13 @@
 RUN: %cpp_compiler %S/GcSectionsTest.cpp -o %t -fuse-ld=lld -ffunction-sections -Wl,-gc-sections
 RUN: nm %t | not grep UnusedFunctionShouldBeRemovedByLinker
 RUN: %run %t -runs=0 2>&1 | FileCheck %s
-CHECK-NOT: ERROR: The size of coverage PC tables does not match
 
 With gc sections, with trace-pc. Unused code is removed.
 RUN: %cpp_compiler %S/GcSectionsTest.cpp -o %t -fsanitize-coverage=0 -fsanitize-coverage=trace-pc -ffunction-sections -Wl,-gc-sections
 RUN: nm %t | not grep UnusedFunctionShouldBeRemovedByLinker
+
+RUN: %cpp_compiler %S/GcSectionsTest.cpp -o %t -fsanitize-coverage=0 -fsanitize-coverage=trace-pc-guard,pc-table -fuse-ld=lld -ffunction-sections -Wl,-gc-sections
+RUN: nm %t | not grep UnusedFunctionShouldBeRemovedByLinker
+RUN: %run %t -runs=0 2>&1 | FileCheck %s
+
+CHECK-NOT: ERROR: The size of coverage PC tables does not match
diff --git a/test/fuzzer/handle-unstable.test b/test/fuzzer/handle-unstable.test
index a0b7c83..d202246 100644
--- a/test/fuzzer/handle-unstable.test
+++ b/test/fuzzer/handle-unstable.test
@@ -1,5 +1,6 @@
 # Tests -handle_unstable
-UNSUPPORTED: aarch64
+# FIXME: Disabled on Windows until symbolization works properly.
+UNSUPPORTED: windows
 
 RUN: %cpp_compiler %S/PrintUnstableStatsTest.cpp -o %t-HandleUnstableTest
 
diff --git a/test/fuzzer/lit.cfg b/test/fuzzer/lit.cfg
index 8a44860..608991c 100644
--- a/test/fuzzer/lit.cfg
+++ b/test/fuzzer/lit.cfg
@@ -24,15 +24,18 @@
 # the test runner updated.
 config.test_format = lit.formats.ShTest(execute_external)
 
-# LeakSanitizer is not supported on OSX right now.
-if sys.platform.startswith('darwin') or sys.platform.startswith('freebsd'):
+# LeakSanitizer is not supported on OSX or Windows right now.
+if (sys.platform.startswith('darwin') or
+    sys.platform.startswith('freebsd') or
+    sys.platform.startswith('netbsd') or
+    sys.platform.startswith('win')):
   lit_config.note('lsan feature unavailable')
 else:
   lit_config.note('lsan feature available')
   config.available_features.add('lsan')
 
-# MemorySanitizer is not supported on OSX right now
-if sys.platform.startswith('darwin'):
+# MemorySanitizer is not supported on OSX or Windows right now
+if sys.platform.startswith('darwin') or sys.platform.startswith('win'):
   lit_config.note('msan feature unavailable')
   assert 'msan' not in config.available_features
 else:
@@ -67,10 +70,18 @@
         config.runtime_library_dir)
   elif any(x in config.target_triple for x in ('darwin', 'freebsd')):
     link_cmd = '-lc++'
+  elif 'windows-msvc' in config.target_triple:
+    link_cmd = ''
   else:
     link_cmd = '-lstdc++'
 
-  std_cmd = '--driver-mode=g++ -std=c++11' if is_cpp else ''
+  if is_cpp and 'windows-msvc' in config.target_triple:
+    std_cmd = '--driver-mode=cl'
+  elif is_cpp:
+    std_cmd = '--driver-mode=g++ -std=c++11'
+  else:
+    std_cmd = ''
+
   if msan_enabled:
     sanitizers = ['memory']
   else:
diff --git a/test/fuzzer/merge-control-file.test b/test/fuzzer/merge-control-file.test
index 64b7471..60b2a6a 100644
--- a/test/fuzzer/merge-control-file.test
+++ b/test/fuzzer/merge-control-file.test
@@ -1,6 +1,8 @@
 XFAIL: ios
 RUN: mkdir -p %t
-RUN: %cpp_compiler %S/FullCoverageSetTest.cpp -o %t/T
+# Use a ".exe" extension because it is needed on Windows to call system()
+# to execute itself again.
+RUN: %cpp_compiler %S/FullCoverageSetTest.cpp -o %t/T.exe
 
 RUN: rm -rf %t/T0 %t/T1 %t/T2
 RUN: mkdir -p %t/T0 %t/T1 %t/T2
@@ -11,9 +13,9 @@
 # Test what happens if the control file is junk.
 
 RUN: echo JUNK > %t/MCF
-RUN: not %run %t/T -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=JUNK
+RUN: not %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=JUNK
 RUN: echo 3 > %t/MCF; echo 0 >> %t/MCF; echo %t/T1/1 >> %t/MCF
-RUN: not %run %t/T -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=JUNK
+RUN: not %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=JUNK
 JUNK: MERGE-OUTER: non-empty control file provided: {{.*}}MCF
 JUNK: MERGE-OUTER: bad control file, will overwrite it
 
@@ -22,18 +24,18 @@
 
 RUN: rm -f %t/T1/*; cp %t/T0/* %t/T1
 RUN: echo 3 > %t/MCF; echo 0 >> %t/MCF; echo %t/T1/1 >> %t/MCF; echo %t/T1/2 >> %t/MCF; echo %t/T1/3 >> %t/MCF
-RUN: %run %t/T -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=OK_0
+RUN: %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=OK_0
 OK_0: MERGE-OUTER: control file ok, 3 files total, first not processed file 0
 OK_0: MERGE-OUTER: 3 new files with {{.*}} new features added
 
 RUN: rm -f %t/T1/*; cp %t/T0/* %t/T1
 RUN: echo 3 > %t/MCF; echo 0 >> %t/MCF; echo %t/T1/1 >> %t/MCF; echo %t/T1/2 >> %t/MCF; echo %t/T1/3 >> %t/MCF
-RUN: %run %t/T -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF -save_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=SAVE_SUMMARY
+RUN: %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF -save_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=SAVE_SUMMARY
 SAVE_SUMMARY: MERGE-OUTER: writing coverage summary for 3 files to {{.*}}/SUMMARY
 
 RUN: rm -f %t/T1/*; cp %t/T0/* %t/T1
 RUN: echo 3 > %t/MCF; echo 0 >> %t/MCF; echo %t/T1/1 >> %t/MCF; echo %t/T1/2 >> %t/MCF; echo %t/T1/3 >> %t/MCF
-RUN: %run %t/T -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF -load_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=LOAD_SUMMARY
+RUN: %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF -load_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=LOAD_SUMMARY
 LOAD_SUMMARY: MERGE-OUTER: coverage summary loaded from
 
 RUN: rm -f %t/T1/*; cp %t/T0/* %t/T1
@@ -42,7 +44,7 @@
 RUN: echo DONE 0 11 >> %t/MCF
 RUN: echo STARTED 1 2 >> %t/MCF
 RUN: echo DONE 1 12 >> %t/MCF
-RUN: %run %t/T -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=OK_2
+RUN: %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=OK_2
 OK_2: MERGE-OUTER: control file ok, 3 files total, first not processed file 2
 OK_2: MERGE-OUTER: 3 new files with {{.*}} new features added
 
@@ -54,5 +56,5 @@
 RUN: echo DONE 1 12 >> %t/MCF
 RUN: echo STARTED 2 2 >> %t/MCF
 RUN: echo DONE 2 13 >> %t/MCF
-RUN: %run %t/T -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=OK_3
+RUN: %run %t/T.exe -merge=1 %t/T1 %t/T2 -merge_control_file=%t/MCF 2>&1 | FileCheck %s --check-prefix=OK_3
 OK_3: MERGE-OUTER: nothing to do, merge has been completed before
diff --git a/test/fuzzer/merge-posix.test b/test/fuzzer/merge-posix.test
index db0a48b..883b7b6 100644
--- a/test/fuzzer/merge-posix.test
+++ b/test/fuzzer/merge-posix.test
@@ -1,4 +1,5 @@
 XFAIL: ios
+UNSUPPORTED: windows
 RUN: %cpp_compiler %S/FullCoverageSetTest.cpp -o %t-FullCoverageSetTest
 
 RUN: rm -rf  %tmp/T1 %tmp/T2
diff --git a/test/fuzzer/merge-sigusr.test b/test/fuzzer/merge-sigusr.test
index a03e544..44448ca 100644
--- a/test/fuzzer/merge-sigusr.test
+++ b/test/fuzzer/merge-sigusr.test
@@ -1,5 +1,7 @@
 # Check that libFuzzer honors SIGUSR1/SIGUSR2
-UNSUPPORTED: darwin
+# FIXME: Disabled on Windows for now because of reliance on posix only features
+# (eg: export, "&", pkill).
+UNSUPPORTED: darwin, windows
 RUN: rm -rf %t
 RUN: mkdir -p %t
 RUN: %cpp_compiler %S/SleepOneSecondTest.cpp -o %t/LFSIGUSR
diff --git a/test/fuzzer/minimize_crash.test b/test/fuzzer/minimize_crash.test
index de44b87..dcab67b 100644
--- a/test/fuzzer/minimize_crash.test
+++ b/test/fuzzer/minimize_crash.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: windows
 RUN: %cpp_compiler %S/NullDerefTest.cpp -o %t-NullDerefTest
 RUN: %cpp_compiler %S/SingleByteInputTest.cpp -o %t-SingleByteInputTest
 RUN: mkdir -p %t.dir
diff --git a/test/fuzzer/minimize_two_crashes.test b/test/fuzzer/minimize_two_crashes.test
index 3c528f7..cba88ee 100644
--- a/test/fuzzer/minimize_two_crashes.test
+++ b/test/fuzzer/minimize_two_crashes.test
@@ -1,5 +1,5 @@
-# Test that the minimizer stops when it sees a differe bug.
-UNSUPPORTED: freebsd
+# Test that the minimizer stops when it sees a different bug.
+UNSUPPORTED: freebsd,windows
 
 # TODO: Find out why test fails on Darwin with -O2.
 RUN: %cpp_compiler -O0 %S/TwoDifferentBugsTest.cpp -o %t-TwoDifferentBugsTest
diff --git a/test/fuzzer/null-deref-on-empty.test b/test/fuzzer/null-deref-on-empty.test
index f159a79..d576cc1 100644
--- a/test/fuzzer/null-deref-on-empty.test
+++ b/test/fuzzer/null-deref-on-empty.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: windows
 RUN: %cpp_compiler %S/NullDerefOnEmptyTest.cpp -o %t-NullDerefOnEmptyTest
 
 RUN: not %run %t-NullDerefOnEmptyTest -print_final_stats=1 2>&1 | FileCheck %s --check-prefix=NULL_DEREF_ON_EMPTY
diff --git a/test/fuzzer/null-deref.test b/test/fuzzer/null-deref.test
index 31eb599..e9926ca 100644
--- a/test/fuzzer/null-deref.test
+++ b/test/fuzzer/null-deref.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: windows
 RUN: %cpp_compiler %S/NullDerefTest.cpp -o %t-NullDerefTest
 
 RUN: not %run %t-NullDerefTest                  2>&1 | FileCheck %s --check-prefix=NullDerefTest
diff --git a/test/fuzzer/print_unstable_stats.test b/test/fuzzer/print_unstable_stats.test
index c20efe9..41b2791 100644
--- a/test/fuzzer/print_unstable_stats.test
+++ b/test/fuzzer/print_unstable_stats.test
@@ -1,10 +1,11 @@
 # Tests -print_unstable_stats
-UNSUPPORTED: aarch64
+# Disabled on Windows because of differences symbolizing and flakiness.
+UNSUPPORTED: aarch64, windows
 
 RUN: %cpp_compiler %S/PrintUnstableStatsTest.cpp -o %t-PrintUnstableStatsTest
 
 RUN: %run %t-PrintUnstableStatsTest -print_unstable_stats=1 -runs=100000 2>&1 | FileCheck %s --check-prefix=LONG
-; We do not observe ini functions since we take the minimum hit counts, and minimum hit counts for ini is 0.
+# We do not observe ini functions since we take the minimum hit counts, and minimum hit counts for ini is 0.
 LONG: UNSTABLE_FUNCTIONS:
 LONG-NOT: det0()
 LONG-NOT: det1()
diff --git a/test/fuzzer/sigusr.test b/test/fuzzer/sigusr.test
index 0b3ddc7..fa477a7 100644
--- a/test/fuzzer/sigusr.test
+++ b/test/fuzzer/sigusr.test
@@ -1,4 +1,6 @@
-UNSUPPORTED: darwin
+# FIXME: Disabled on Windows for now because of reliance on posix only features
+# (eg: export, "&", pkill).
+UNSUPPORTED: darwin, windows
 # Check that libFuzzer honors SIGUSR1/SIGUSR2
 RUN: rm -rf %t
 RUN: mkdir -p %t
diff --git a/test/fuzzer/trace-malloc-threaded.test b/test/fuzzer/trace-malloc-threaded.test
index 8f972d6..f38005c 100644
--- a/test/fuzzer/trace-malloc-threaded.test
+++ b/test/fuzzer/trace-malloc-threaded.test
@@ -1,6 +1,7 @@
 // FIXME: This test infinite loops on darwin because it crashes
 // printing a stack trace repeatedly
-UNSUPPORTED: darwin, aarch64
+// FIXME: Disabled on Windows because of a crash (possibly related to above).
+UNSUPPORTED: darwin, aarch64, windows
 
 RUN: %cpp_compiler %S/TraceMallocThreadedTest.cpp -o \
 RUN:   %t-TraceMallocThreadedTest
diff --git a/test/fuzzer/trace-malloc-unbalanced.test b/test/fuzzer/trace-malloc-unbalanced.test
index 193df01..c7b4632 100644
--- a/test/fuzzer/trace-malloc-unbalanced.test
+++ b/test/fuzzer/trace-malloc-unbalanced.test
@@ -6,14 +6,17 @@
 
 RUN: %cpp_compiler %S/TraceMallocTest.cpp -o %t-TraceMallocTest
 
+# Specify python because we can't use the shebang line on Windows.
 RUN: %run %t-TraceMallocTest -seed=1 -trace_malloc=1 -runs=200 2>&1 | \
-RUN:    %libfuzzer_src/scripts/unbalanced_allocs.py --skip=5 | FileCheck %s
+RUN:    python %libfuzzer_src/scripts/unbalanced_allocs.py --skip=5 | FileCheck %s
 
 RUN: %run %t-TraceMallocTest -seed=1 -trace_malloc=2 -runs=200 2>&1 | \
-RUN:    %libfuzzer_src/scripts/unbalanced_allocs.py --skip=5 | FileCheck %s --check-prefixes=CHECK,CHECK2
+RUN:    python %libfuzzer_src/scripts/unbalanced_allocs.py --skip=5 | FileCheck %s --check-prefixes=CHECK,CHECK2
 
 CHECK: MallocFreeTracer: START
-CHECK: Unbalanced MALLOC[{{[0-9]+}}] [[PTR:0x[0-9a-f]+]] 4
+# Behavior of the format string "%p" is implementation defined. Account for the
+# implementation on Windows and Linux.
+CHECK: Unbalanced MALLOC[{{[0-9]+}}] [[PTR:(:?0x)?[0-9a-fA-F]+]] 4
 CHECK2-NEXT: {{ #0 0x[0-9a-f]+ in }}
 CHECK2-NEXT: {{ #1 0x[0-9a-f]+ in }}
 CHECK2-NEXT: {{ #2 0x[0-9a-f]+ in }}
diff --git a/test/fuzzer/ulimit.test b/test/fuzzer/ulimit.test
index 076866c..7cf4c0a 100644
--- a/test/fuzzer/ulimit.test
+++ b/test/fuzzer/ulimit.test
@@ -1,3 +1,5 @@
+# FIXME: Disabled on Windows for now because Windows has no ulimit command.
+UNSUPPORTED: windows
 RUN: %cpp_compiler %S/SimpleTest.cpp -o %t-SimpleTest
 RUN: ulimit -s 1000
 RUN: not %run %t-SimpleTest
diff --git a/test/fuzzer/value-profile-cmp.test b/test/fuzzer/value-profile-cmp.test
index b927422..8d964b1 100644
--- a/test/fuzzer/value-profile-cmp.test
+++ b/test/fuzzer/value-profile-cmp.test
@@ -1,3 +1,5 @@
+# FIXME: Disabled on Windows because of hangs.
+UNSUPPORTED: windows
 CHECK: BINGO
 RUN: %cpp_compiler %S/SimpleCmpTest.cpp -o %t-SimpleCmpTest
 RUN: not %run %t-SimpleCmpTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 2>&1 | FileCheck %s
diff --git a/test/fuzzer/value-profile-cmp2.test b/test/fuzzer/value-profile-cmp2.test
index 4bf119f..5935ed6 100644
--- a/test/fuzzer/value-profile-cmp2.test
+++ b/test/fuzzer/value-profile-cmp2.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: ios
 CHECK: BINGO
 RUN: %cpp_compiler -fno-sanitize=address %S/SimpleHashTest.cpp -o %t-SimpleHashTest
 RUN: not %run %t-SimpleHashTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 -max_len=64 2>&1 | FileCheck %s
diff --git a/test/fuzzer/value-profile-cmp3.test b/test/fuzzer/value-profile-cmp3.test
index 58ba18b..fe71592 100644
--- a/test/fuzzer/value-profile-cmp3.test
+++ b/test/fuzzer/value-profile-cmp3.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: ios
 CHECK: BINGO
 RUN: %cpp_compiler %S/AbsNegAndConstantTest.cpp -o %t-AbsNegAndConstantTest
 RUN: not %run %t-AbsNegAndConstantTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 2>&1 | FileCheck %s
diff --git a/test/fuzzer/value-profile-cmp4.test b/test/fuzzer/value-profile-cmp4.test
index 05bc3f4..e5ac29f 100644
--- a/test/fuzzer/value-profile-cmp4.test
+++ b/test/fuzzer/value-profile-cmp4.test
@@ -1,3 +1,5 @@
+# FIXME: Disabled on Windows because of hangs.
+UNSUPPORTED: windows
 CHECK: BINGO
 RUN: %cpp_compiler %S/AbsNegAndConstant64Test.cpp -o %t-AbsNegAndConstant64Test
 RUN: not %run %t-AbsNegAndConstant64Test -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 2>&1 | FileCheck %s
diff --git a/test/fuzzer/value-profile-div.test b/test/fuzzer/value-profile-div.test
index 59cc7c2..38f2112 100644
--- a/test/fuzzer/value-profile-div.test
+++ b/test/fuzzer/value-profile-div.test
@@ -1,4 +1,4 @@
-XFAIL: ios
+UNSUPPORTED: ios
 UNSUPPORTED: aarch64
 CHECK: AddressSanitizer: {{FPE|int-divide-by-zero}}
 RUN: %cpp_compiler %S/DivTest.cpp -fsanitize-coverage=trace-div -o %t-DivTest
diff --git a/test/fuzzer/value-profile-load.test b/test/fuzzer/value-profile-load.test
index 607b81c..b6baf13 100644
--- a/test/fuzzer/value-profile-load.test
+++ b/test/fuzzer/value-profile-load.test
@@ -1,3 +1,5 @@
+# FIXME: Disabled on Windows because of hangs.
+UNSUPPORTED: windows
 CHECK: AddressSanitizer: global-buffer-overflow
 RUN: %cpp_compiler %S/LoadTest.cpp -fsanitize-coverage=trace-gep -o %t-LoadTest
 RUN: not %run %t-LoadTest -seed=2 -use_cmp=0 -use_value_profile=1 -runs=20000000 2>&1 | FileCheck %s
diff --git a/test/fuzzer/value-profile-mem.test b/test/fuzzer/value-profile-mem.test
index 57c844e..7d68b88 100644
--- a/test/fuzzer/value-profile-mem.test
+++ b/test/fuzzer/value-profile-mem.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: ios
 UNSUPPORTED: freebsd
 CHECK: BINGO
 RUN: %cpp_compiler %S/SingleMemcmpTest.cpp -o %t-SingleMemcmpTest
diff --git a/test/fuzzer/value-profile-set.test b/test/fuzzer/value-profile-set.test
index e55f1e4..7515e36 100644
--- a/test/fuzzer/value-profile-set.test
+++ b/test/fuzzer/value-profile-set.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: ios
 CHECK: BINGO
 RUN: %cpp_compiler %S/FourIndependentBranchesTest.cpp -o %t-FourIndependentBranchesTest
 RUN: not %run %t-FourIndependentBranchesTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 2>&1 | FileCheck %s
diff --git a/test/fuzzer/value-profile-strcmp.test b/test/fuzzer/value-profile-strcmp.test
index 647121f..9b7a244 100644
--- a/test/fuzzer/value-profile-strcmp.test
+++ b/test/fuzzer/value-profile-strcmp.test
@@ -1,3 +1,4 @@
+UNSUPPORTED: ios
 UNSUPPORTED: freebsd
 CHECK: BINGO
 RUN: %cpp_compiler %S/SingleStrcmpTest.cpp -o %t-SingleStrcmpTest
diff --git a/test/fuzzer/value-profile-strncmp.test b/test/fuzzer/value-profile-strncmp.test
index b60b97f..98488df 100644
--- a/test/fuzzer/value-profile-strncmp.test
+++ b/test/fuzzer/value-profile-strncmp.test
@@ -1,4 +1,4 @@
-UNSUPPORTED: freebsd
+UNSUPPORTED: freebsd, aarch64
 CHECK: BINGO
 RUN: %cpp_compiler %S/SingleStrncmpTest.cpp -o %t-SingleStrncmpTest
 RUN: not %run %t-SingleStrncmpTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 2>&1 | FileCheck %s
diff --git a/test/fuzzer/value-profile-switch.test b/test/fuzzer/value-profile-switch.test
index cc3d494..a71682d 100644
--- a/test/fuzzer/value-profile-switch.test
+++ b/test/fuzzer/value-profile-switch.test
@@ -1,4 +1,4 @@
-XFAIL: ios
+UNSUPPORTED: ios
 CHECK: BINGO
 RUN: %cpp_compiler %S/SwitchTest.cpp -o %t-SwitchTest
 RUN: %cpp_compiler %S/Switch2Test.cpp -o %t-Switch2Test
diff --git a/test/hwasan/TestCases/deep-recursion.c b/test/hwasan/TestCases/deep-recursion.c
new file mode 100644
index 0000000..1ac0f5b
--- /dev/null
+++ b/test/hwasan/TestCases/deep-recursion.c
@@ -0,0 +1,73 @@
+// RUN: %clang_hwasan -O1 %s -o %t
+// RUN: %env_hwasan_opts=stack_history_size=1 not %run %t 2>&1 | FileCheck %s --check-prefix=D1
+// RUN: %env_hwasan_opts=stack_history_size=2 not %run %t 2>&1 | FileCheck %s --check-prefix=D2
+// RUN: %env_hwasan_opts=stack_history_size=3 not %run %t 2>&1 | FileCheck %s --check-prefix=D3
+// RUN: %env_hwasan_opts=stack_history_size=5 not %run %t 2>&1 | FileCheck %s --check-prefix=D5
+// RUN:                                       not %run %t 2>&1 | FileCheck %s --check-prefix=DEFAULT
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+// At least -O1 is needed for this function to not have a stack frame on
+// AArch64.
+void USE(void *x) { // pretend_to_do_something(void *x)
+  __asm__ __volatile__("" : : "r" (x) : "memory");
+}
+
+volatile int four = 4;
+
+__attribute__((noinline)) void OOB() { int x[4]; x[four] = 0; USE(&x[0]); }
+__attribute__((noinline)) void FUNC1() { int x; USE(&x); OOB(); }
+__attribute__((noinline)) void FUNC2() { int x; USE(&x); FUNC1(); }
+__attribute__((noinline)) void FUNC3() { int x; USE(&x); FUNC2(); }
+__attribute__((noinline)) void FUNC4() { int x; USE(&x); FUNC3(); }
+__attribute__((noinline)) void FUNC5() { int x; USE(&x); FUNC4(); }
+__attribute__((noinline)) void FUNC6() { int x; USE(&x); FUNC5(); }
+__attribute__((noinline)) void FUNC7() { int x; USE(&x); FUNC6(); }
+__attribute__((noinline)) void FUNC8() { int x; USE(&x); FUNC7(); }
+__attribute__((noinline)) void FUNC9() { int x; USE(&x); FUNC8(); }
+__attribute__((noinline)) void FUNC10() { int x; USE(&x); FUNC9(); }
+
+int main() { FUNC10(); }
+
+// D1: Previosly allocated frames
+// D1: in OOB
+// D1-NOT: in FUNC
+// D1: Memory tags around the buggy address
+
+// D2: Previosly allocated frames
+// D2: in OOB
+// D2: in FUNC1
+// D2-NOT: in FUNC
+// D2: Memory tags around the buggy address
+
+// D3: Previosly allocated frames
+// D3: in OOB
+// D3: in FUNC1
+// D3: in FUNC2
+// D3-NOT: in FUNC
+// D3: Memory tags around the buggy address
+
+// D5: Previosly allocated frames
+// D5: in OOB
+// D5: in FUNC1
+// D5: in FUNC2
+// D5: in FUNC3
+// D5: in FUNC4
+// D5-NOT: in FUNC
+// D5: Memory tags around the buggy address
+
+// DEFAULT: Previosly allocated frames
+// DEFAULT: in OOB
+// DEFAULT: in FUNC1
+// DEFAULT: in FUNC2
+// DEFAULT: in FUNC3
+// DEFAULT: in FUNC4
+// DEFAULT: in FUNC5
+// DEFAULT: in FUNC6
+// DEFAULT: in FUNC7
+// DEFAULT: in FUNC8
+// DEFAULT: in FUNC9
+// DEFAULT: in FUNC10
+// DEFAULT-NOT: in FUNC
+// DEFAULT: Memory tags around the buggy address
diff --git a/test/hwasan/TestCases/double-free.c b/test/hwasan/TestCases/double-free.c
new file mode 100644
index 0000000..e97aae6
--- /dev/null
+++ b/test/hwasan/TestCases/double-free.c
@@ -0,0 +1,23 @@
+// RUN: %clang_hwasan %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sanitizer/hwasan_interface.h>
+
+int main() {
+  __hwasan_enable_allocator_tagging();
+  char * volatile x = (char*)malloc(40);
+  free(x);
+  free(x);
+// CHECK: ERROR: HWAddressSanitizer: invalid-free on address
+// CHECK: tags: [[PTR_TAG:..]]/[[MEM_TAG:..]] (ptr/mem)
+// CHECK: freed by thread {{.*}} here:
+// CHECK: previously allocated here:
+// CHECK: Memory tags around the buggy address (one tag corresponds to 16 bytes):
+// CHECK: =>{{.*}}[[MEM_TAG]]
+  fprintf(stderr, "DONE\n");
+  __hwasan_disable_allocator_tagging();
+// CHECK-NOT: DONE
+}
diff --git a/test/hwasan/TestCases/heap-buffer-overflow.c b/test/hwasan/TestCases/heap-buffer-overflow.c
new file mode 100644
index 0000000..40b6e6e
--- /dev/null
+++ b/test/hwasan/TestCases/heap-buffer-overflow.c
@@ -0,0 +1,26 @@
+// RUN: %clang_hwasan  %s -o %t
+// RUN: not %run %t 40 2>&1 | FileCheck %s --check-prefix=CHECK40
+// RUN: not %run %t 80 2>&1 | FileCheck %s --check-prefix=CHECK80
+// RUN: not %run %t -30 2>&1 | FileCheck %s --check-prefix=CHECKm30
+// RUN: not %run %t -30 1000000 2>&1 | FileCheck %s --check-prefix=CHECKMm30
+// RUN: not %run %t 1000000 1000000 2>&1 | FileCheck %s --check-prefix=CHECKM
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sanitizer/hwasan_interface.h>
+
+int main(int argc, char **argv) {
+  __hwasan_enable_allocator_tagging();
+  int offset = argc < 2 ? 40 : atoi(argv[1]);
+  int size = argc < 3 ? 30 : atoi(argv[2]);
+  char * volatile x = (char*)malloc(size);
+  x[offset] = 42;
+// CHECK40: is located 10 bytes to the right of 30-byte region
+// CHECK80: is located 50 bytes to the right of 30-byte region
+// CHECKm30: is located 30 bytes to the left of 30-byte region
+// CHECKMm30: is located 30 bytes to the left of 1000000-byte region
+// CHECKM: is located 0 bytes to the right of 1000000-byte region
+  free(x);
+}
diff --git a/test/hwasan/TestCases/hwasan-print-shadow.cc b/test/hwasan/TestCases/hwasan-print-shadow.cc
new file mode 100644
index 0000000..fa6330b
--- /dev/null
+++ b/test/hwasan/TestCases/hwasan-print-shadow.cc
@@ -0,0 +1,29 @@
+// RUN: %clangxx_hwasan -DSIZE=16 -O0 %s -o %t && %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: stable-runtime
+
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sanitizer/hwasan_interface.h>
+
+int main() {
+  char *p = (char *)mmap(nullptr, 4096, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+  assert(p);
+
+  __hwasan_tag_memory(p, 1, 32);
+  __hwasan_tag_memory(p + 32, 3, 16);
+  __hwasan_tag_memory(p + 48, 0, 32);
+  __hwasan_tag_memory(p + 80, 4, 16);
+
+  char *q = (char *)__hwasan_tag_pointer(p, 7);
+  __hwasan_print_shadow(q + 5, 89 - 5);
+  // CHECK:      HWASan shadow map for {{.*}}5 .. {{.*}}9 (pointer tag 7)
+  // CHECK-NEXT:   {{.*}}0: 1
+  // CHECK-NEXT:   {{.*}}0: 1
+  // CHECK-NEXT:   {{.*}}0: 3
+  // CHECK-NEXT:   {{.*}}0: 0
+  // CHECK-NEXT:   {{.*}}0: 0
+  // CHECK-NEXT:   {{.*}}0: 4
+}
diff --git a/test/hwasan/TestCases/longjmp.c b/test/hwasan/TestCases/longjmp.c
new file mode 100644
index 0000000..8d847b5
--- /dev/null
+++ b/test/hwasan/TestCases/longjmp.c
@@ -0,0 +1,26 @@
+// RUN: %clang_hwasan -O0 -DNEGATIVE %s -o %t && %run %t 2>&1
+// RUN: %clang_hwasan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+#include <assert.h>
+#include <sanitizer/hwasan_interface.h>
+
+__attribute__((noinline))
+int f(void *caller_frame) {
+  int z = 0;
+  int *volatile p = &z;
+  // Tag of local is never zero.
+  assert(__hwasan_tag_pointer(p, 0) != p);
+#ifndef NEGATIVE
+  // This will destroy shadow of "z", and the following load will crash.
+  __hwasan_handle_longjmp(caller_frame);
+#endif
+  return p[0];
+}
+
+int main() {
+  return f(__builtin_frame_address(0));
+  // CHECK: READ of size 8 at {{.*}} tags: {{.*}}/00 (ptr/mem)
+}
diff --git a/test/hwasan/TestCases/malloc-test.c b/test/hwasan/TestCases/malloc-test.c
new file mode 100644
index 0000000..199464b
--- /dev/null
+++ b/test/hwasan/TestCases/malloc-test.c
@@ -0,0 +1,16 @@
+// Test basic malloc functionality.
+// RUN: %clang_hwasan %s -o %t
+// RUN: %run %t
+
+#include <stdlib.h>
+#include <assert.h>
+#include <sanitizer/hwasan_interface.h>
+#include <sanitizer/allocator_interface.h>
+
+int main() {
+  __hwasan_enable_allocator_tagging();
+  char *a1 = (char*)malloc(0);
+  assert(a1 != 0);
+  assert(__sanitizer_get_allocated_size(a1) == 0);
+  free(a1);
+}
diff --git a/test/hwasan/TestCases/malloc_fill.cc b/test/hwasan/TestCases/malloc_fill.cc
new file mode 100644
index 0000000..b8513b7
--- /dev/null
+++ b/test/hwasan/TestCases/malloc_fill.cc
@@ -0,0 +1,22 @@
+// Check that we fill malloc-ed memory correctly.
+// RUN: %clangxx_hwasan %s -o %t
+// RUN: %run %t | FileCheck %s
+// RUN: %env_hwasan_opts=max_malloc_fill_size=10:malloc_fill_byte=8 %run %t | FileCheck %s --check-prefix=CHECK-10-8
+// RUN: %env_hwasan_opts=max_malloc_fill_size=20:malloc_fill_byte=171 %run %t | FileCheck %s --check-prefix=CHECK-20-ab
+
+#include <stdio.h>
+int main(int argc, char **argv) {
+  // With asan allocator this makes sure we get memory from mmap.
+  static const int kSize = 1 << 25;
+  unsigned char *x = new unsigned char[kSize];
+  printf("-");
+  for (int i = 0; i <= 32; i++) {
+    printf("%02x", x[i]);
+  }
+  printf("-\n");
+  delete [] x;
+}
+
+// CHECK: -bebebebebebebebebebebebebebebebebebebebebebebebebebebebebebebebebe-
+// CHECK-10-8: -080808080808080808080000000000000000000000000000000000000000000000-
+// CHECK-20-ab: -abababababababababababababababababababab00000000000000000000000000-
diff --git a/test/hwasan/TestCases/many-threads-uaf.c b/test/hwasan/TestCases/many-threads-uaf.c
new file mode 100644
index 0000000..3a79cb3
--- /dev/null
+++ b/test/hwasan/TestCases/many-threads-uaf.c
@@ -0,0 +1,45 @@
+// RUN: %clang_hwasan %s -o %t && not %env_hwasan_opts=verbose_threads=1 %run %t 2>&1 | FileCheck %s
+// REQUIRES: stable-runtime
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <sanitizer/hwasan_interface.h>
+
+void *BoringThread(void *arg) {
+  char * volatile x = (char*)malloc(10);
+  x[5] = 0;
+  free(x);
+  return NULL;
+}
+
+// CHECK: Creating  : T0
+// CHECK: Creating  : T1
+// CHECK: Destroying: T1
+// CHECK: Creating  : T1100
+// CHECK: Destroying: T1100
+// CHECK: Creating  : T1101
+
+void *UAFThread(void *arg) {
+  char * volatile x = (char*)malloc(10);
+  fprintf(stderr, "ZZZ %p\n", x);
+  free(x);
+  x[5] = 42;
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch on address
+  // CHECK: WRITE of size 1
+  // CHECK: many-threads-uaf.c:[[@LINE-3]]
+  // CHECK: Thread: T1101
+  return NULL;
+}
+
+int main() {
+  __hwasan_enable_allocator_tagging();
+  pthread_t t;
+  for (int i = 0; i < 1100; i++) {
+    pthread_create(&t, NULL, BoringThread, NULL);
+    pthread_join(t, NULL);
+  }
+  pthread_create(&t, NULL, UAFThread, NULL);
+  pthread_join(t, NULL);
+}
diff --git a/test/hwasan/TestCases/new-test.cc b/test/hwasan/TestCases/new-test.cc
new file mode 100644
index 0000000..3b1991e
--- /dev/null
+++ b/test/hwasan/TestCases/new-test.cc
@@ -0,0 +1,18 @@
+// Test basic new functionality.
+// RUN: %clangxx_hwasan %s -o %t
+// RUN: %run %t
+
+#include <stdlib.h>
+#include <assert.h>
+#include <sanitizer/hwasan_interface.h>
+#include <sanitizer/allocator_interface.h>
+
+int main() {
+  __hwasan_enable_allocator_tagging();
+
+  size_t volatile n = 0;
+  char *a1 = new char[n];
+  assert(a1 != nullptr);
+  assert(__sanitizer_get_allocated_size(a1) == 0);
+  delete[] a1;
+}
diff --git a/test/hwasan/TestCases/print-memory-usage-android.c b/test/hwasan/TestCases/print-memory-usage-android.c
new file mode 100644
index 0000000..5a05792
--- /dev/null
+++ b/test/hwasan/TestCases/print-memory-usage-android.c
@@ -0,0 +1,21 @@
+// Tests __hwasan_print_memory_usage through /proc/$PID/maps.
+// RUN: %clang_hwasan %s -o %t && %env_hwasan_opts=export_memory_stats=1 %run %t 2>&1 | FileCheck %s
+// REQUIRES: android
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+// The function needs to be unsanitized in order for &cmd to be untagged. This
+// address is passed to system() and then to execve() syscall. The tests need to
+// run on unpatched linux kernel, which at this time does not accept tagged
+// pointers in system call arguments (but there is hope: see
+// https://lore.kernel.org/patchwork/cover/979328).
+__attribute__((no_sanitize("hwaddress")))
+int main() {
+  char cmd[1024];
+  snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+  system(cmd);
+  // CHECK: HWASAN pid: [[PID:[0-9]*]] rss: {{.*}} threads: 1 stacks: [[STACKS:[0-9]*]] thr_aux: {{.*}} stack_depot: {{.*}} uniq_stacks: [[UNIQ_STACKS:[0-9]*]] heap: [[HEAP:[0-9]*]]
+}
diff --git a/test/hwasan/TestCases/print-memory-usage.c b/test/hwasan/TestCases/print-memory-usage.c
new file mode 100644
index 0000000..df9d534
--- /dev/null
+++ b/test/hwasan/TestCases/print-memory-usage.c
@@ -0,0 +1,72 @@
+// Tests __hwasan_print_memory_usage.
+// RUN: %clang_hwasan %s -o %t
+// RUN: ulimit -s 1000
+// RUN: %run %t 2>&1 | FileCheck %s
+// REQUIRES: stable-runtime
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <sanitizer/hwasan_interface.h>
+
+int state;
+__thread volatile char *sink;
+
+__attribute__((noinline))
+void *malloc_and_use(int size) {
+  char *x = (char*)malloc(size);
+  for (int i = 0; i < size; i++)
+    x[i] = 42;  // make this memory used.
+  return x;
+}
+
+void *T1(void *arg) {
+
+  for (int i = 1; i <= (1 << 20); i *= 2)
+    sink = malloc_and_use(i);
+
+  __sync_fetch_and_add(&state, 1);
+  while (__sync_fetch_and_add(&state, 0) != 4) {}
+  return NULL;
+}
+
+void *T4(void *arg) { return NULL; }
+
+int main() {
+  __hwasan_enable_allocator_tagging();
+  sink = malloc_and_use(10);
+
+  __hwasan_print_memory_usage();
+  // CHECK: HWASAN pid: [[PID:[0-9]*]] rss: {{.*}} threads: 1 stacks: [[STACKS:[0-9]*]] thr_aux: {{.*}} stack_depot: {{.*}} uniq_stacks: [[UNIQ_STACKS:[0-9]*]] heap: [[HEAP:[0-9]*]]
+
+  void *one_meg = malloc_and_use(1 << 20);
+
+  __hwasan_print_memory_usage();
+  // CHECK: HWASAN pid: [[PID]] rss: {{.*}} threads: 1 stacks: [[STACKS]] thr_aux: {{.*}} stack_depot: {{.*}}
+
+  free(one_meg);
+
+  __hwasan_print_memory_usage();
+  // CHECK: HWASAN pid: [[PID]] rss: {{.*}} threads: 1 stacks: [[STACKS]] thr_aux: {{.*}} stack_depot: {{.*}} uniq_stacks: {{.*}} heap: [[HEAP]]
+
+  pthread_t t1, t2, t3, t4;
+
+  pthread_create(&t1, NULL, T1, NULL);
+  pthread_create(&t2, NULL, T1, NULL);
+  pthread_create(&t3, NULL, T1, NULL);
+  pthread_create(&t4, NULL, T4, NULL);
+  while (__sync_fetch_and_add(&state, 0) != 3) {}
+  pthread_join(t4, NULL);
+
+  __hwasan_print_memory_usage();
+  // CHECK: HWASAN pid: [[PID]] rss: {{.*}} threads: 4 stacks:
+
+  __sync_fetch_and_add(&state, 1);
+  pthread_join(t1, NULL);
+  pthread_join(t2, NULL);
+  pthread_join(t3, NULL);
+  __hwasan_print_memory_usage();
+  // CHECK: HWASAN pid: [[PID]] rss: {{.*}} threads: 1 stacks: [[STACKS]]
+}
diff --git a/test/hwasan/TestCases/pthread_exit.c b/test/hwasan/TestCases/pthread_exit.c
new file mode 100644
index 0000000..937e20c
--- /dev/null
+++ b/test/hwasan/TestCases/pthread_exit.c
@@ -0,0 +1,5 @@
+// Tests pthread_exit.
+// RUN: %clang_hwasan %s -o %t && %run %t
+// REQUIRES: stable-runtime
+#include <pthread.h>
+int main() { pthread_exit(NULL); }
diff --git a/test/hwasan/TestCases/realloc-after-free.c b/test/hwasan/TestCases/realloc-after-free.c
new file mode 100644
index 0000000..c4bc48c
--- /dev/null
+++ b/test/hwasan/TestCases/realloc-after-free.c
@@ -0,0 +1,28 @@
+// RUN: %clang_hwasan %s -o %t
+// RUN: not %run %t 50 2>&1 | FileCheck %s
+// RUN: not %run %t 40 2>&1 | FileCheck %s
+// RUN: not %run %t 30 2>&1 | FileCheck %s
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sanitizer/hwasan_interface.h>
+
+int main(int argc, char **argv) {
+  __hwasan_enable_allocator_tagging();
+  if (argc != 2) return 0;
+  int realloc_size = atoi(argv[1]);
+  char * volatile x = (char*)malloc(40);
+  free(x);
+  x = realloc(x, realloc_size);
+// CHECK: ERROR: HWAddressSanitizer: invalid-free on address
+// CHECK: tags: [[PTR_TAG:..]]/[[MEM_TAG:..]] (ptr/mem)
+// CHECK: freed by thread {{.*}} here:
+// CHECK: previously allocated here:
+// CHECK: Memory tags around the buggy address (one tag corresponds to 16 bytes):
+// CHECK: =>{{.*}}[[MEM_TAG]]
+  fprintf(stderr, "DONE\n");
+  __hwasan_disable_allocator_tagging();
+// CHECK-NOT: DONE
+}
diff --git a/test/hwasan/TestCases/realloc-test.cc b/test/hwasan/TestCases/realloc-test.cc
new file mode 100644
index 0000000..8387902
--- /dev/null
+++ b/test/hwasan/TestCases/realloc-test.cc
@@ -0,0 +1,37 @@
+// Test basic realloc functionality.
+// RUN: %clang_hwasan %s -o %t
+// RUN: %run %t
+
+#include <stdlib.h>
+#include <assert.h>
+#include <sanitizer/hwasan_interface.h>
+
+int main() {
+  __hwasan_enable_allocator_tagging();
+  char *x = (char*)realloc(nullptr, 4);
+  x[0] = 10;
+  x[1] = 20;
+  x[2] = 30;
+  x[3] = 40;
+  char *x1 = (char*)realloc(x, 5);
+  assert(x1 != x);  // not necessary true for C,
+                    // but true today for hwasan.
+  assert(x1[0] == 10 && x1[1] == 20 && x1[2] == 30 && x1[3] == 40);
+  x1[4] = 50;
+
+  char *x2 = (char*)realloc(x1, 6);
+  x2[5] = 60;
+  assert(x2 != x1);
+  assert(x2[0] == 10 && x2[1] == 20 && x2[2] == 30 && x2[3] == 40 &&
+         x2[4] == 50 && x2[5] == 60);
+
+  char *x3 = (char*)realloc(x2, 6);
+  assert(x3 != x2);
+  assert(x3[0] == 10 && x3[1] == 20 && x3[2] == 30 && x3[3] == 40 &&
+         x3[4] == 50 && x3[5] == 60);
+
+  char *x4 = (char*)realloc(x3, 5);
+  assert(x4 != x3);
+  assert(x4[0] == 10 && x4[1] == 20 && x4[2] == 30 && x4[3] == 40 &&
+         x4[4] == 50);
+}
diff --git a/test/hwasan/TestCases/rich-stack.c b/test/hwasan/TestCases/rich-stack.c
new file mode 100644
index 0000000..6787d57
--- /dev/null
+++ b/test/hwasan/TestCases/rich-stack.c
@@ -0,0 +1,66 @@
+// Test how stack frames are reported (not fully implemented yet).
+// RUN: %clang_hwasan %s -o %t
+// RUN: not %run %t 3 2 -1 2>&1 | FileCheck %s --check-prefix=R321
+// REQUIRES: stable-runtime
+#include <stdint.h>
+#include <stdlib.h>
+void USE(void *x) { // pretend_to_do_something(void *x)
+  __asm__ __volatile__("" : : "r" (x) : "memory");
+}
+void USE2(void *a, void *b) { USE(a); USE(b); }
+void USE4(void *a, void *b, void *c, void *d) { USE2(a, b); USE2(c, d); }
+
+void BAR(int depth, int err_depth, int offset);
+
+uint64_t *leaked_ptr;
+
+void FOO(int depth, int err_depth, int offset) {
+  uint8_t v1;
+  uint16_t v2;
+  uint32_t v4;
+  uint64_t v8;
+  uint64_t v16[2];
+  uint64_t v32[4];
+  uint64_t v48[3];
+  USE4(&v1, &v2, &v4, &v8);  USE4(&v16, &v32, &v48, 0);
+  leaked_ptr = &v16[0];
+  if (depth)
+    BAR(depth - 1, err_depth, offset);
+
+  if (err_depth == depth)
+    v16[offset] = 0;  // maybe OOB.
+  if (err_depth == -depth)
+    leaked_ptr[offset] = 0; // maybe UAR.
+  USE(&v16);
+}
+
+void BAR(int depth, int err_depth, int offset) {
+  uint64_t x16[2];
+  uint64_t x32[4];
+  USE2(&x16, &x32);
+  leaked_ptr = &x16[0];
+  if (depth)
+    FOO(depth - 1, err_depth, offset);
+  if (err_depth == depth)
+    x16[offset] = 0;  // maybe OOB
+  if (err_depth == -depth)
+    leaked_ptr[offset] = 0;  // maybe UAR
+  USE(&x16);
+}
+
+
+int main(int argc, char **argv) {
+  if (argc != 4) return -1;
+  int depth = atoi(argv[1]);
+  int err_depth = atoi(argv[2]);
+  int offset = atoi(argv[3]);
+  FOO(depth, err_depth, offset);
+  return 0;
+}
+
+// R321: HWAddressSanitizer: tag-mismatch
+// R321-NEXT: WRITE of size 8
+// R321-NEXT: in BAR
+// R321-NEXT: in FOO
+// R321-NEXT: in main
+// R321: is located in stack of thread T0
diff --git a/test/hwasan/TestCases/sanitizer_malloc.cc b/test/hwasan/TestCases/sanitizer_malloc.cc
new file mode 100644
index 0000000..66ac964
--- /dev/null
+++ b/test/hwasan/TestCases/sanitizer_malloc.cc
@@ -0,0 +1,29 @@
+// Test allocator aliases.
+//
+// RUN: %clangxx_hwasan -O0 %s -o %t && %run %t
+
+#include <sanitizer/hwasan_interface.h>
+
+int main() {
+  void *volatile sink;
+  sink = (void *)&__sanitizer_posix_memalign;
+  sink = (void *)&__sanitizer_memalign;
+  sink = (void *)&__sanitizer_aligned_alloc;
+  sink = (void *)&__sanitizer___libc_memalign;
+  sink = (void *)&__sanitizer_valloc;
+  sink = (void *)&__sanitizer_pvalloc;
+  sink = (void *)&__sanitizer_free;
+  sink = (void *)&__sanitizer_cfree;
+  sink = (void *)&__sanitizer_malloc_usable_size;
+  sink = (void *)&__sanitizer_mallinfo;
+  sink = (void *)&__sanitizer_mallopt;
+  sink = (void *)&__sanitizer_malloc_stats;
+  sink = (void *)&__sanitizer_calloc;
+  sink = (void *)&__sanitizer_realloc;
+  sink = (void *)&__sanitizer_malloc;
+
+  // sanity check
+  void *p = __sanitizer_malloc(100);
+  p = __sanitizer_realloc(p, 200);
+  __sanitizer_free(p);
+}
diff --git a/test/hwasan/TestCases/sizes.cpp b/test/hwasan/TestCases/sizes.cpp
new file mode 100644
index 0000000..52217de
--- /dev/null
+++ b/test/hwasan/TestCases/sizes.cpp
@@ -0,0 +1,80 @@
+// RUN: %clangxx_hwasan %s -lstdc++ -o %t
+// RUN: %env_hwasan_opts=allocator_may_return_null=0 not %run %t malloc 2>&1          | FileCheck %s --check-prefix=CHECK-max
+// RUN: %env_hwasan_opts=allocator_may_return_null=1     %run %t malloc 2>&1
+// RUN: %env_hwasan_opts=allocator_may_return_null=0 not %run %t malloc max 2>&1      | FileCheck %s --check-prefix=CHECK-max
+// RUN: %env_hwasan_opts=allocator_may_return_null=1     %run %t malloc max 2>&1
+// RUN: %env_hwasan_opts=allocator_may_return_null=0 not %run %t calloc 2>&1          | FileCheck %s --check-prefix=CHECK-calloc
+// RUN: %env_hwasan_opts=allocator_may_return_null=1     %run %t calloc 2>&1
+// RUN: %env_hwasan_opts=allocator_may_return_null=0 not %run %t new 2>&1             | FileCheck %s --check-prefix=CHECK-max
+// RUN: %env_hwasan_opts=allocator_may_return_null=1 not %run %t new 2>&1             | FileCheck %s --check-prefix=CHECK-oom
+// RUN: %env_hwasan_opts=allocator_may_return_null=0 not %run %t new max 2>&1         | FileCheck %s --check-prefix=CHECK-max
+// RUN: %env_hwasan_opts=allocator_may_return_null=1 not %run %t new max 2>&1         | FileCheck %s --check-prefix=CHECK-oom
+// RUN: %env_hwasan_opts=allocator_may_return_null=0 not %run %t new-nothrow 2>&1     | FileCheck %s --check-prefix=CHECK-max
+// RUN: %env_hwasan_opts=allocator_may_return_null=1     %run %t new-nothrow 2>&1
+// RUN: %env_hwasan_opts=allocator_may_return_null=0 not %run %t new-nothrow max 2>&1 | FileCheck %s --check-prefix=CHECK-max
+// RUN: %env_hwasan_opts=allocator_may_return_null=1     %run %t new-nothrow max 2>&1
+// RUN:                                                 %run %t usable 2>&1
+
+// Tests for various edge cases related to sizes, notably the maximum size the
+// allocator can allocate. Tests that an integer overflow in the parameters of
+// calloc is caught.
+
+#include <assert.h>
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <limits>
+#include <new>
+
+#include <sanitizer/allocator_interface.h>
+
+int main(int argc, char **argv) {
+  assert(argc <= 3);
+  bool test_size_max = argc == 3 && !strcmp(argv[2], "max");
+
+  static const size_t kMaxAllowedMallocSize = 1ULL << 40;
+  static const size_t kChunkHeaderSize = 16;
+
+  size_t MallocSize = test_size_max ? std::numeric_limits<size_t>::max()
+                                    : kMaxAllowedMallocSize;
+
+  if (!strcmp(argv[1], "malloc")) {
+    void *p = malloc(MallocSize);
+    assert(!p);
+    p = malloc(kMaxAllowedMallocSize - kChunkHeaderSize);
+    assert(!p);
+  } else if (!strcmp(argv[1], "calloc")) {
+    // Trigger an overflow in calloc.
+    size_t size = std::numeric_limits<size_t>::max();
+    void *p = calloc((size / 0x1000) + 1, 0x1000);
+    assert(!p);
+  } else if (!strcmp(argv[1], "new")) {
+    void *p = operator new(MallocSize);
+    assert(!p);
+  } else if (!strcmp(argv[1], "new-nothrow")) {
+    void *p = operator new(MallocSize, std::nothrow);
+    assert(!p);
+  } else if (!strcmp(argv[1], "usable")) {
+    // Playing with the actual usable size of a chunk.
+    void *p = malloc(1007);
+    assert(p);
+    size_t size = __sanitizer_get_allocated_size(p);
+    assert(size >= 1007);
+    memset(p, 'A', size);
+    p = realloc(p, 2014);
+    assert(p);
+    size = __sanitizer_get_allocated_size(p);
+    assert(size >= 2014);
+    memset(p, 'B', size);
+    free(p);
+  } else {
+    assert(0);
+  }
+
+  return 0;
+}
+
+// CHECK-max: {{ERROR: HWAddressSanitizer: requested allocation size .* exceeds maximum supported size}}
+// CHECK-oom: ERROR: HWAddressSanitizer: allocator is out of memory
+// CHECK-calloc: ERROR: HWAddressSanitizer: calloc parameters overflow
diff --git a/test/hwasan/TestCases/stack-history-length.c b/test/hwasan/TestCases/stack-history-length.c
new file mode 100644
index 0000000..f4c0b03
--- /dev/null
+++ b/test/hwasan/TestCases/stack-history-length.c
@@ -0,0 +1,36 @@
+// RUN: %clang_hwasan -O1 -DX=2046 %s -o %t.2046
+// RUN: %clang_hwasan -O1 -DX=2047 %s -o %t.2047
+// RUN: %env_hwasan_opts=stack_history_size=2048 not %run %t.2046 2>&1 | FileCheck %s --check-prefix=YES
+// RUN: %env_hwasan_opts=stack_history_size=2048 not %run %t.2047 2>&1 | FileCheck %s --check-prefix=NO
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+
+void USE(void *x) { // pretend_to_do_something(void *x)
+  __asm__ __volatile__("" : : "r" (x) : "memory");
+}
+
+volatile int four = 4;
+__attribute__((noinline)) void FUNC0() { int x[4]; USE(&x[0]); }
+__attribute__((noinline)) void FUNC() { int x[4]; USE(&x[0]); }
+__attribute__((noinline)) void OOB() { int x[4]; x[four] = 0; USE(&x[0]); }
+
+int main() {
+  // FUNC0 is X+2's element of the ring buffer.
+  // If runtime buffer size is less than it, FUNC0 record will be lost.
+  FUNC0();
+  for (int i = 0; i < X; ++i)
+    FUNC();
+  OOB();
+}
+
+// YES: Previosly allocated frames
+// YES: OOB
+// YES: FUNC
+// YES: FUNC0
+
+// NO: Previosly allocated frames
+// NO: OOB
+// NO: FUNC
+// NO-NOT: FUNC0
diff --git a/test/hwasan/TestCases/stack-oob.c b/test/hwasan/TestCases/stack-oob.c
new file mode 100644
index 0000000..567af33
--- /dev/null
+++ b/test/hwasan/TestCases/stack-oob.c
@@ -0,0 +1,25 @@
+// RUN: %clang_hwasan -DSIZE=16 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clang_hwasan -DSIZE=64 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clang_hwasan -DSIZE=0x1000 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+#include <sanitizer/hwasan_interface.h>
+
+__attribute__((noinline))
+int f() {
+  char z[SIZE];
+  char *volatile p = z;
+  return p[SIZE];
+}
+
+int main() {
+  return f();
+  // CHECK: READ of size 1 at
+  // CHECK: #0 {{.*}} in f{{.*}}stack-oob.c:14
+
+  // CHECK: is located in stack of threa
+
+  // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in f
+}
diff --git a/test/hwasan/TestCases/stack-oob.cc b/test/hwasan/TestCases/stack-oob.cc
deleted file mode 100644
index 60b9a62..0000000
--- a/test/hwasan/TestCases/stack-oob.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %clangxx_hwasan -DSIZE=16 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
-// RUN: %clangxx_hwasan -DSIZE=64 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
-// RUN: %clangxx_hwasan -DSIZE=0x1000 -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
-
-// REQUIRES: stable-runtime
-
-#include <stdlib.h>
-#include <sanitizer/hwasan_interface.h>
-
-__attribute__((noinline))
-int f() {
-  char z[SIZE];
-  char *volatile p = z;
-  return p[SIZE];
-}
-
-int main() {
-  return f();
-  // CHECK: READ of size 1 at
-  // CHECK: #0 {{.*}} in f{{.*}}stack-oob.cc:14
-
-  // CHECK: HWAddressSanitizer can not describe address in more detail.
-
-  // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in f
-}
diff --git a/test/hwasan/TestCases/stack-uar.cc b/test/hwasan/TestCases/stack-uar.c
similarity index 61%
rename from test/hwasan/TestCases/stack-uar.cc
rename to test/hwasan/TestCases/stack-uar.c
index e99dcce..2c59b17 100644
--- a/test/hwasan/TestCases/stack-uar.cc
+++ b/test/hwasan/TestCases/stack-uar.c
@@ -1,4 +1,4 @@
-// RUN: %clangxx_hwasan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clang_hwasan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
 
 // REQUIRES: stable-runtime
 
@@ -15,9 +15,9 @@
 int main() {
   return *f();
   // CHECK: READ of size 1 at
-  // CHECK: #0 {{.*}} in main{{.*}}stack-uar.cc:16
+  // CHECK: #0 {{.*}} in main{{.*}}stack-uar.c:16
 
-  // CHECK: HWAddressSanitizer can not describe address in more detail.
+  // CHECK: is located in stack of thread
 
   // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in main
 }
diff --git a/test/hwasan/TestCases/thread-uaf.c b/test/hwasan/TestCases/thread-uaf.c
new file mode 100644
index 0000000..33cea10
--- /dev/null
+++ b/test/hwasan/TestCases/thread-uaf.c
@@ -0,0 +1,56 @@
+// Tests UAF detection where Allocate/Deallocate/Use
+// happen in separate threads.
+// RUN: %clang_hwasan %s -o %t && not %run %t 2>&1 | FileCheck %s
+// REQUIRES: stable-runtime
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <sanitizer/hwasan_interface.h>
+
+char *volatile x;
+int state;
+
+void *Allocate(void *arg) {
+  x = (char*)malloc(10);
+  __sync_fetch_and_add(&state, 1);
+  while (__sync_fetch_and_add(&state, 0) != 3) {}
+  return NULL;
+}
+void *Deallocate(void *arg) {
+  free(x);
+  __sync_fetch_and_add(&state, 1);
+  while (__sync_fetch_and_add(&state, 0) != 3) {}
+  return NULL;
+}
+
+void *Use(void *arg) {
+  x[5] = 42;
+  // CHECK: ERROR: HWAddressSanitizer: tag-mismatch on address
+  // CHECK: WRITE of size 1 {{.*}} in thread T3
+  // CHECK: thread-uaf.c:[[@LINE-3]]
+  // CHECK: freed by thread T2 here
+  // CHECK: in Deallocate
+  // CHECK: previously allocated here:
+  // CHECK: in Allocate
+  // CHECK: Thread: T2 0x
+  // CHECK: Thread: T3 0x
+  __sync_fetch_and_add(&state, 1);
+  return NULL;
+}
+
+int main() {
+  __hwasan_enable_allocator_tagging();
+  pthread_t t1, t2, t3;
+
+  pthread_create(&t1, NULL, Allocate, NULL);
+  while (__sync_fetch_and_add(&state, 0) != 1) {}
+  pthread_create(&t2, NULL, Deallocate, NULL);
+  while (__sync_fetch_and_add(&state, 0) != 2) {}
+  pthread_create(&t3, NULL, Use, NULL);
+
+  pthread_join(t1, NULL);
+  pthread_join(t2, NULL);
+  pthread_join(t3, NULL);
+}
diff --git a/test/hwasan/TestCases/uaf_with_rb_distance.c b/test/hwasan/TestCases/uaf_with_rb_distance.c
new file mode 100644
index 0000000..25aae52
--- /dev/null
+++ b/test/hwasan/TestCases/uaf_with_rb_distance.c
@@ -0,0 +1,27 @@
+// Checks how we print the developer note "hwasan_dev_note_heap_rb_distance".
+// RUN: %clang_hwasan %s -o %t
+// RUN: not %run %t 10 2>&1 | FileCheck %s --check-prefix=D10
+// RUN: not %run %t 42 2>&1 | FileCheck %s --check-prefix=D42
+
+// REQUIRES: stable-runtime
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sanitizer/hwasan_interface.h>
+
+
+void *p[100];
+
+int main(int argc, char **argv) {
+  __hwasan_enable_allocator_tagging();
+  int distance = argc >= 2 ? atoi(argv[1]) : 1;
+  for (int i = 0; i < 100; i++)
+    p[i] = malloc(i);
+  for (int i = 0; i < 100; i++)
+    free(p[i]);
+
+  *(int*)p[distance] = 0;
+}
+
+// D10: hwasan_dev_note_heap_rb_distance: 90 1023
+// D42: hwasan_dev_note_heap_rb_distance: 58 1023
diff --git a/test/hwasan/TestCases/use-after-free.c b/test/hwasan/TestCases/use-after-free.c
index b9f6060..3dae97b 100644
--- a/test/hwasan/TestCases/use-after-free.c
+++ b/test/hwasan/TestCases/use-after-free.c
@@ -1,13 +1,14 @@
-// RUN: %clang_hwasan -O0 -DLOAD %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,LOAD
-// RUN: %clang_hwasan -O1 -DLOAD %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,LOAD
-// RUN: %clang_hwasan -O2 -DLOAD %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,LOAD
-// RUN: %clang_hwasan -O3 -DLOAD %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,LOAD
+// RUN: %clang_hwasan -O0 -DISREAD=1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_hwasan -O1 -DISREAD=1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_hwasan -O2 -DISREAD=1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_hwasan -O3 -DISREAD=1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK
 
-// RUN: %clang_hwasan -O0 -DSTORE %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK,STORE
+// RUN: %clang_hwasan -O0 -DISREAD=0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK
 
 // REQUIRES: stable-runtime
 
 #include <stdlib.h>
+#include <stdio.h>
 #include <sanitizer/hwasan_interface.h>
 
 int main() {
@@ -15,25 +16,22 @@
   char * volatile x = (char*)malloc(10);
   free(x);
   __hwasan_disable_allocator_tagging();
-#ifdef STORE
-  x[5] = 42;
-#endif
-#ifdef LOAD
-  return x[5];
-#endif
-  // LOAD: READ of size 1 at
-  // LOAD: #0 {{.*}} in main {{.*}}use-after-free.c:22
+  fprintf(stderr, "Going to do a %s\n", ISREAD ? "READ" : "WRITE");
+  // CHECK: Going to do a [[TYPE:[A-Z]*]]
+  int r = 0;
+  if (ISREAD) r = x[5]; else x[5] = 42;  // should be on the same line.
+  // CHECK: [[TYPE]] of size 1 at {{.*}} tags: [[PTR_TAG:[0-9a-f][0-9a-f]]]/[[MEM_TAG:[0-9a-f][0-9a-f]]] (ptr/mem)
+  // CHECK: #0 {{.*}} in main {{.*}}use-after-free.c:[[@LINE-2]]
 
-  // STORE: WRITE of size 1 at
-  // STORE: #0 {{.*}} in main {{.*}}use-after-free.c:19
-
-  // CHECK: freed here:
+  // CHECK: freed by thread {{.*}} here:
   // CHECK: #0 {{.*}} in {{.*}}free{{.*}} {{.*}}hwasan_interceptors.cc
-  // CHECK: #1 {{.*}} in main {{.*}}use-after-free.c:16
+  // CHECK: #1 {{.*}} in main {{.*}}use-after-free.c:[[@LINE-11]]
 
   // CHECK: previously allocated here:
   // CHECK: #0 {{.*}} in {{.*}}malloc{{.*}} {{.*}}hwasan_interceptors.cc
-  // CHECK: #1 {{.*}} in main {{.*}}use-after-free.c:15
-
+  // CHECK: #1 {{.*}} in main {{.*}}use-after-free.c:[[@LINE-16]]
+  // CHECK: Memory tags around the buggy address (one tag corresponds to 16 bytes):
+  // CHECK: =>{{.*}}[[MEM_TAG]]
   // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in main
+  return r;
 }
diff --git a/test/lit.common.cfg b/test/lit.common.cfg
index 5274b49..6591227 100644
--- a/test/lit.common.cfg
+++ b/test/lit.common.cfg
@@ -106,6 +106,10 @@
 if config.emulator:
   config.substitutions.append( ('%run', config.emulator) )
   config.substitutions.append( ('%env ', "env ") )
+  # TODO: Implement `%device_rm` to perform removal of files in the emulator.
+  # For now just make it a no-op.
+  lit_config.warning('%device_rm is not implemented')
+  config.substitutions.append( ('%device_rm', 'echo ') )
   config.compile_wrapper = ""
 elif config.host_os == 'Darwin' and config.apple_platform != "osx":
   # Darwin tests can be targetting macOS, a device or a simulator. All devices
@@ -127,14 +131,18 @@
   ios_or_iossim = "iossim" if config.apple_platform.endswith("sim") else "ios"
 
   config.available_features.add('ios')
+  device_id_env = "SANITIZER_" + ios_or_iossim.upper() + "_TEST_DEVICE_IDENTIFIER"
   if ios_or_iossim == "iossim":
     config.available_features.add('iossim')
+    if device_id_env not in os.environ:
+      lit_config.fatal(
+        '{} must be set in the environment when running iossim tests'.format(
+          device_id_env))
   if config.apple_platform != "ios" and config.apple_platform != "iossim":
     config.available_features.add(config.apple_platform)
 
   ios_commands_dir = os.path.join(config.compiler_rt_src_root, "test", "sanitizer_common", "ios_commands")
 
-  device_id_env = "SANITIZER_" + ios_or_iossim.upper() + "_TEST_DEVICE_IDENTIFIER"
   run_wrapper = os.path.join(ios_commands_dir, ios_or_iossim + "_run.py")
   env_wrapper = os.path.join(ios_commands_dir, ios_or_iossim + "_env.py")
   compile_wrapper = os.path.join(ios_commands_dir, ios_or_iossim + "_compile.py")
@@ -144,9 +152,17 @@
     config.environment[device_id_env] = os.environ[device_id_env]
   config.substitutions.append(('%run', run_wrapper))
   config.substitutions.append(('%env ', env_wrapper + " "))
+  # Current implementation of %device_rm uses the run_wrapper to do
+  # the work.
+  config.substitutions.append(('%device_rm', '{} rm '.format(run_wrapper)))
   config.compile_wrapper = compile_wrapper
 
-  prepare_output = subprocess.check_output([prepare_script, config.apple_platform, config.clang]).strip()
+  try:
+    prepare_output = subprocess.check_output([prepare_script, config.apple_platform, config.clang]).strip()
+  except subprocess.CalledProcessError as e:
+    print("Command failed:")
+    print(e.output)
+    raise e
   if len(prepare_output) > 0: print(prepare_output)
   prepare_output_json = prepare_output.split("\n")[-1]
   prepare_output = json.loads(prepare_output_json)
@@ -157,9 +173,15 @@
   config.compile_wrapper = compile_wrapper
   config.substitutions.append( ('%run', "") )
   config.substitutions.append( ('%env ', "env ") )
+  # TODO: Implement `%device_rm` to perform removal of files on a device.  For
+  # now just make it a no-op.
+  lit_config.warning('%device_rm is not implemented')
+  config.substitutions.append( ('%device_rm', 'echo ') )
 else:
   config.substitutions.append( ('%run', "") )
   config.substitutions.append( ('%env ', "env ") )
+  # When running locally %device_rm is a no-op.
+  config.substitutions.append( ('%device_rm', 'echo ') )
   config.compile_wrapper = ""
 
 # Define CHECK-%os to check for OS-dependent output.
@@ -295,7 +317,7 @@
   config.lto_supported = True
   config.lto_launch = ["env", "DYLD_LIBRARY_PATH=" + config.llvm_shlib_dir]
   config.lto_flags = []
-elif config.host_os == 'Linux' and is_linux_lto_supported():
+elif config.host_os in ['Linux', 'FreeBSD', 'NetBSD'] and is_linux_lto_supported():
   config.lto_supported = True
   config.lto_launch = []
   if config.use_lld:
diff --git a/test/lsan/TestCases/Linux/fork_and_leak.cc b/test/lsan/TestCases/Linux/fork_and_leak.cc
new file mode 100644
index 0000000..d7427ce
--- /dev/null
+++ b/test/lsan/TestCases/Linux/fork_and_leak.cc
@@ -0,0 +1,23 @@
+// Test that leaks detected after forking without exec().
+// RUN: %clangxx_lsan %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+int main() {
+  pid_t pid = fork();
+  assert(pid >= 0);
+  if (pid > 0) {
+    int status = 0;
+    waitpid(pid, &status, 0);
+    assert(WIFEXITED(status));
+    return WEXITSTATUS(status);
+  } else {
+    malloc(1337);
+    // CHECK: LeakSanitizer: detected memory leaks
+  }
+  return 0;
+}
+
diff --git a/test/lsan/TestCases/Linux/fork_with_threads.cc b/test/lsan/TestCases/Linux/fork_with_threads.cc
deleted file mode 100644
index 221c5d2..0000000
--- a/test/lsan/TestCases/Linux/fork_with_threads.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-// Test forked process does not run lsan.
-// RUN: %clangxx_lsan %s -o %t && %run %t 2>&1 | FileCheck %s
-
-#include <pthread.h>
-#include <stdlib.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-static pthread_barrier_t barrier;
-
-// CHECK-NOT: SUMMARY: {{(Leak|Address)}}Sanitizer:
-static void *thread_func(void *arg) {
-  void *buffer = malloc(1337);
-  pthread_barrier_wait(&barrier);
-  for (;;)
-    pthread_yield();
-  return 0;
-}
-
-int main() {
-  pthread_barrier_init(&barrier, 0, 2);
-  pthread_t tid;
-  int res = pthread_create(&tid, 0, thread_func, 0);
-  pthread_barrier_wait(&barrier);
-  pthread_barrier_destroy(&barrier);
-
-  pid_t pid = fork();
-  if (pid > 0) {
-    int status = 0;
-    waitpid(pid, &status, 0);
-  }
-  return 0;
-}
-
-// CHECK: WARNING: LeakSanitizer is disabled in forked process
diff --git a/test/msan/chained_origin_with_signals.cc b/test/msan/chained_origin_with_signals.cc
index 43dbdcc..9c07156 100644
--- a/test/msan/chained_origin_with_signals.cc
+++ b/test/msan/chained_origin_with_signals.cc
@@ -10,6 +10,9 @@
 // RUN:     not %run %t >%t.out 2>&1
 // RUN: FileCheck %s < %t.out
 
+// Reported deadly signal due to stack-overflow
+// XFAIL: netbsd
+
 #include <signal.h>
 #include <stdio.h>
 #include <sys/types.h>
diff --git a/test/msan/dtls_test.c b/test/msan/dtls_test.c
index b9021e0..ee91be2 100644
--- a/test/msan/dtls_test.c
+++ b/test/msan/dtls_test.c
@@ -8,6 +8,9 @@
 
    XFAIL: FreeBSD
    UNSUPPORTED: powerpc
+
+   // Reports use-of-uninitialized-value, not analyzed
+   XFAIL: netbsd
 */
 
 #ifndef BUILD_SO
diff --git a/test/msan/fork.cc b/test/msan/fork.cc
index e4dc549..87d71f8 100644
--- a/test/msan/fork.cc
+++ b/test/msan/fork.cc
@@ -14,6 +14,9 @@
 // UNSUPPORTED: powerpc64-target-arch
 // UNSUPPORTED: powerpc64le-target-arch
 
+// Sometimes hangs
+// UNSUPPORTED: netbsd
+
 #include <pthread.h>
 #include <unistd.h>
 #include <stdio.h>
diff --git a/test/msan/ioctl_custom.cc b/test/msan/ioctl_custom.cc
index eaab633..794f345 100644
--- a/test/msan/ioctl_custom.cc
+++ b/test/msan/ioctl_custom.cc
@@ -4,6 +4,9 @@
 // RUN: %clangxx_msan -DPOSITIVE -O0 -g %s -o %t && not %run %t 2>&1 | FileCheck %s
 // RUN: %clangxx_msan -DPOSITIVE -O3 -g %s -o %t && not %run %t 2>&1 | FileCheck %s
 
+// Reports different report (not analyzed)
+// XFAIL: netbsd
+
 #include <assert.h>
 #include <stdlib.h>
 #include <net/if.h>
diff --git a/test/msan/lit.cfg b/test/msan/lit.cfg
index 550d04d..e08673d 100644
--- a/test/msan/lit.cfg
+++ b/test/msan/lit.cfg
@@ -17,14 +17,20 @@
                       config.debug_info_flags)
 # Some Msan tests leverage backtrace() which requires libexecinfo on FreeBSD.
 if config.host_os == 'FreeBSD':
-  clang_msan_cflags += ["-lexecinfo"]
+  clang_msan_cflags += ["-lexecinfo", "-fPIC"]
 clang_msan_cxxflags = config.cxx_mode_flags + clang_msan_cflags
 
+# Flags for KMSAN invocation. This is C-only, we're not interested in C++.
+clang_kmsan_cflags = (["-fsanitize=kernel-memory"] +
+                      [config.target_cflags] +
+                      config.debug_info_flags)
+
 def build_invocation(compile_flags):
   return " " + " ".join([config.clang] + compile_flags) + " "
 
 config.substitutions.append( ("%clang_msan ", build_invocation(clang_msan_cflags)) )
 config.substitutions.append( ("%clangxx_msan ", build_invocation(clang_msan_cxxflags)) )
+config.substitutions.append( ("%clang_kmsan ", build_invocation(clang_kmsan_cflags)) )
 
 # Default test suffixes.
 config.suffixes = ['.c', '.cc', '.cpp']
diff --git a/test/msan/signal_stress_test.cc b/test/msan/signal_stress_test.cc
index 5bc6f59..dfbc580 100644
--- a/test/msan/signal_stress_test.cc
+++ b/test/msan/signal_stress_test.cc
@@ -2,6 +2,9 @@
 //
 // Test that va_arg shadow from a signal handler does not leak outside.
 
+// Reported deadly signal due to stack-overflow
+// XFAIL: netbsd
+
 #include <signal.h>
 #include <stdarg.h>
 #include <sanitizer/msan_interface.h>
diff --git a/test/msan/tls_reuse.cc b/test/msan/tls_reuse.cc
index 9c2ee97..78a328f 100644
--- a/test/msan/tls_reuse.cc
+++ b/test/msan/tls_reuse.cc
@@ -1,7 +1,6 @@
 // RUN: %clangxx_msan -O0 %s -o %t && %run %t
 
 // Check that when TLS block is reused between threads, its shadow is cleaned.
-// XFAIL: freebsd
 
 #include <pthread.h>
 #include <stdio.h>
diff --git a/test/msan/vararg.cc b/test/msan/vararg.cc
new file mode 100644
index 0000000..e1a7b12
--- /dev/null
+++ b/test/msan/vararg.cc
@@ -0,0 +1,60 @@
+// RUN: %clangxx_msan -fsanitize-memory-track-origins=0 -O3 %s -o %t && \
+// RUN:     not %run %t va_arg_tls >%t.out 2>&1
+// RUN: FileCheck %s --check-prefix=CHECK < %t.out
+
+// RUN: %clangxx_msan -fsanitize-memory-track-origins=0 -O3 %s -o %t && \
+// RUN:     not %run %t overflow >%t.out 2>&1
+// RUN: FileCheck %s --check-prefix=CHECK < %t.out
+
+// RUN: %clangxx_msan -fsanitize-memory-track-origins=2 -O3 %s -o %t && \
+// RUN:     not %run %t va_arg_tls >%t.out 2>&1
+// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-ORIGIN < %t.out
+
+// RUN: %clangxx_msan -fsanitize-memory-track-origins=2 -O3 %s -o %t && \
+// RUN:     not %run %t overflow >%t.out 2>&1
+// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-ORIGIN < %t.out
+
+// Check that shadow and origin are passed through va_args.
+
+// Copying origins on AArch64, MIPS and PowerPC isn't supported yet.
+// XFAIL: aarch64
+// XFAIL: mips
+// XFAIL: powerpc64
+
+#include <stdarg.h>
+#include <string.h>
+
+__attribute__((noinline))
+int sum(int n, ...) {
+  va_list args;
+  int i, sum = 0, arg;
+  volatile int temp;
+  va_start(args, n);
+  for (i = 0; i < n; i++) {
+    arg = va_arg(args, int);
+    sum += arg;
+  }
+  va_end(args);
+  return sum;
+}
+
+int main(int argc, char *argv[]) {
+  volatile int uninit;
+  volatile int a = 1, b = 2;
+  if (argc == 2) {
+    // Shadow/origin will be passed via va_arg_tls/va_arg_origin_tls.
+    if (strcmp(argv[1], "va_arg_tls") == 0) {
+      return sum(3, uninit, a, b);
+    }
+    // Shadow/origin of |uninit| will be passed via overflow area.
+    if (strcmp(argv[1], "overflow") == 0) {
+      return sum(7,
+        a, a, a, a, a, a, uninit
+      );
+    }
+  }
+  return 0;
+}
+
+// CHECK: WARNING: MemorySanitizer: use-of-uninitialized-value
+// CHECK-ORIGIN: Uninitialized value was created by an allocation of 'uninit' in the stack frame of function 'main'
diff --git a/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov b/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov
index 7caf508..2a7b724 100644
--- a/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov
+++ b/test/profile/Inputs/instrprof-gcov-exceptions.cpp.gcov
@@ -6,7 +6,7 @@
 // CHECK-NEXT:        -:    1:#include <string>
 // CHECK-NEXT:        -:    2:
 // CHECK-NEXT:        -:    3:void asd(std::string i) {
-// CHECK-NEXT:        2:    4:}
+// CHECK-NEXT:        1:    4:}
 // CHECK-NEXT:        -:    5:
 // CHECK-NEXT:        -:    6:int main(void)
 // CHECK-NEXT:        -:    7:{
diff --git a/test/profile/Inputs/instrprof-gcov-switch1.c.gcov b/test/profile/Inputs/instrprof-gcov-switch1.c.gcov
index 7d136dc..f19431e 100644
--- a/test/profile/Inputs/instrprof-gcov-switch1.c.gcov
+++ b/test/profile/Inputs/instrprof-gcov-switch1.c.gcov
@@ -5,9 +5,9 @@
 // CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        -:    1:int main(void)
 // CHECK-NEXT:        -:    2:{
-// CHECK-NEXT:        2:    3:  int i = 22;
+// CHECK-NEXT:        1:    3:  int i = 22;
 // CHECK-NEXT:        -:    4:
-// CHECK-NEXT:        2:    5:  switch (i) {
+// CHECK-NEXT:        1:    5:  switch (i) {
 // CHECK-NEXT:        -:    6:    case 7:
 // CHECK-NEXT:    #####:    7:      break;
 // CHECK-NEXT:        -:    8:
diff --git a/test/profile/Inputs/instrprof-gcov-switch2.c.gcov b/test/profile/Inputs/instrprof-gcov-switch2.c.gcov
index 67f4086..0b85e0f 100644
--- a/test/profile/Inputs/instrprof-gcov-switch2.c.gcov
+++ b/test/profile/Inputs/instrprof-gcov-switch2.c.gcov
@@ -5,9 +5,9 @@
 // CHECK-NEXT:        -:    0:Programs:1
 // CHECK-NEXT:        -:    1:int main(void)
 // CHECK-NEXT:        -:    2:{
-// CHECK-NEXT:        3:    3:  int i = 22;
+// CHECK-NEXT:        1:    3:  int i = 22;
 // CHECK-NEXT:        -:    4:
-// CHECK-NEXT:        3:    5:  switch (i) {
+// CHECK-NEXT:        1:    5:  switch (i) {
 // CHECK-NEXT:        -:    6:    case 7:
 // CHECK-NEXT:    #####:    7:      break;
 // CHECK-NEXT:        -:    8:
diff --git a/test/profile/Inputs/instrprof-shared-main.c.gcov b/test/profile/Inputs/instrprof-shared-main.c.gcov
index 70be367..1636ca6 100644
--- a/test/profile/Inputs/instrprof-shared-main.c.gcov
+++ b/test/profile/Inputs/instrprof-shared-main.c.gcov
@@ -8,8 +8,8 @@
 // CHECK-NEXT:        -:    3:
 // CHECK-NEXT:        -:    4:int main() {
 // CHECK-NEXT:        -:    5:  int i, j;
-// CHECK-NEXT:     2002:    6:  for (i = 0; i < 1000; i++)
-// CHECK-NEXT:  2002000:    7:    for (j = 0; j < 1000; j++)
+// CHECK-NEXT:     1001:    6:  for (i = 0; i < 1000; i++)
+// CHECK-NEXT:  1001000:    7:    for (j = 0; j < 1000; j++)
 // CHECK-NEXT:  1001000:    8:      foo(i * j);
 // CHECK-NEXT:        -:    9:
 // CHECK-NEXT:        1:   10:  if (g2 - g1 == 280001)
diff --git a/test/profile/instrprof-dlopen-dlclose-gcov.test b/test/profile/instrprof-dlopen-dlclose-gcov.test
index 0444fca..8b1b2e2 100644
--- a/test/profile/instrprof-dlopen-dlclose-gcov.test
+++ b/test/profile/instrprof-dlopen-dlclose-gcov.test
@@ -7,6 +7,7 @@
 RUN: %clang --coverage -o %t -fPIC -rpath %t.d %S/Inputs/instrprof-dlopen-dlclose-main.c
 
 # Test with two dlopened libraries.
+RUN: rm -f instrprof-dlopen-dlclose-main.gcda instrprof-dlopen-func.gcda instrprof-dlopen-func2.gcda
 RUN: %run %t
 RUN: llvm-cov gcov instrprof-dlopen-dlclose-main.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-dlopen-dlclose-main.c.gcov %S/Inputs/instrprof-dlopen-dlclose-main.c.gcov
@@ -14,10 +15,10 @@
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-dlopen-func.c.gcov %S/Inputs/instrprof-dlopen-func.c.gcov
 RUN: llvm-cov gcov instrprof-dlopen-func2.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-dlopen-func2.c.gcov %S/Inputs/instrprof-dlopen-func2.c.gcov
-RUN: rm instrprof-dlopen-dlclose-main.gcda instrprof-dlopen-func.gcda instrprof-dlopen-func2.gcda
 
 # Test with three dlopened libraries.
 RUN: %clang -DUSE_LIB3 --coverage -o %t -fPIC -rpath %t.d %S/Inputs/instrprof-dlopen-dlclose-main.c
+RUN: rm -f instrprof-dlopen-dlclose-main.gcda instrprof-dlopen-func.gcda instrprof-dlopen-func2.gcda instrprof-dlopen-func3.gcda
 RUN: %run %t
 RUN: llvm-cov gcov instrprof-dlopen-dlclose-main.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-dlopen-dlclose-main.c.gcov %S/Inputs/instrprof-dlopen-dlclose-main_three-libs.c.gcov
@@ -27,4 +28,3 @@
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-dlopen-func2.c.gcov %S/Inputs/instrprof-dlopen-func2.c.gcov
 RUN: llvm-cov gcov instrprof-dlopen-func3.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-dlopen-func2.c.gcov %S/Inputs/instrprof-dlopen-func3.c.gcov
-RUN: rm instrprof-dlopen-dlclose-main.gcda instrprof-dlopen-func.gcda instrprof-dlopen-func2.gcda instrprof-dlopen-func3.gcda
diff --git a/test/profile/instrprof-gcov-multiple-bbs-single-line.test b/test/profile/instrprof-gcov-multiple-bbs-single-line.test
index 8839455..66b6429 100644
--- a/test/profile/instrprof-gcov-multiple-bbs-single-line.test
+++ b/test/profile/instrprof-gcov-multiple-bbs-single-line.test
@@ -1,5 +1,3 @@
-XFAIL: *
-
 RUN: mkdir -p %t.d
 RUN: cd %t.d
 
diff --git a/test/profile/instrprof-gcov-two-objects.test b/test/profile/instrprof-gcov-two-objects.test
index a53d51d..4080caa 100644
--- a/test/profile/instrprof-gcov-two-objects.test
+++ b/test/profile/instrprof-gcov-two-objects.test
@@ -10,9 +10,9 @@
 RUN: %clang --coverage -o %t instrprof-shared-main.o instrprof-shared-lib.o
 RUN: test -f %t
 
+RUN: rm -f instrprof-shared-main.gcda instrprof-shared-lib.gcda
 RUN: %run %t
 RUN: llvm-cov gcov instrprof-shared-main.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-main.c.gcov %S/Inputs/instrprof-shared-main.c.gcov
 RUN: llvm-cov gcov instrprof-shared-lib.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-lib.c.gcov %S/Inputs/instrprof-shared-lib_in-loop.c.gcov
-RUN: rm instrprof-shared-main.gcda instrprof-shared-lib.gcda
diff --git a/test/profile/instrprof-shared-gcov-flush.test b/test/profile/instrprof-shared-gcov-flush.test
index 50292b6..542db04 100644
--- a/test/profile/instrprof-shared-gcov-flush.test
+++ b/test/profile/instrprof-shared-gcov-flush.test
@@ -11,42 +11,42 @@
 RUN: %clang -DEXIT_ABRUPTLY -DSHARED_CALL_BEFORE_GCOV_FLUSH -DSHARED_CALL_AFTER_GCOV_FLUSH --coverage -o %t -L%t.d -rpath %t.d -lfunc %S/Inputs/instrprof-shared-main-gcov-flush.c
 RUN: test -f instrprof-shared-main-gcov-flush.gcno
 
+RUN: rm -f instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
 RUN: %run %t
 RUN: llvm-cov gcov instrprof-shared-main-gcov-flush.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-main-gcov-flush.c.gcov %S/Inputs/instrprof-shared-main-gcov-flush_no-writeout.c.gcov
 RUN: llvm-cov gcov instrprof-shared-lib.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-lib.c.gcov %S/Inputs/instrprof-shared-lib.c.gcov
-RUN: rm instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
 
 # Test the case where we exit normally and we have a call to the shared library function before __gcov_flush.
 RUN: %clang -DSHARED_CALL_BEFORE_GCOV_FLUSH --coverage -o %t -L%t.d -rpath %t.d -lfunc %S/Inputs/instrprof-shared-main-gcov-flush.c
 RUN: test -f instrprof-shared-main-gcov-flush.gcno
 
+RUN: rm -f instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
 RUN: %run %t
 RUN: llvm-cov gcov instrprof-shared-main-gcov-flush.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-main-gcov-flush.c.gcov %S/Inputs/instrprof-shared-main-gcov-flush_shared-call-before.c.gcov
 RUN: llvm-cov gcov instrprof-shared-lib.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-lib.c.gcov %S/Inputs/instrprof-shared-lib.c.gcov
-RUN: rm instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
 
 # Test the case where we exit normally and we have a call to the shared library function after __gcov_flush.
 RUN: %clang -DSHARED_CALL_AFTER_GCOV_FLUSH --coverage -o %t -L%t.d -rpath %t.d -lfunc %S/Inputs/instrprof-shared-main-gcov-flush.c
 RUN: test -f instrprof-shared-main-gcov-flush.gcno
 
+RUN: rm -f instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
 RUN: %run %t
 RUN: llvm-cov gcov instrprof-shared-main-gcov-flush.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-main-gcov-flush.c.gcov %S/Inputs/instrprof-shared-main-gcov-flush_shared-call-after.c.gcov
 RUN: llvm-cov gcov instrprof-shared-lib.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-lib.c.gcov %S/Inputs/instrprof-shared-lib.c.gcov
-RUN: rm instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
 
 # Test the case where we exit normally and we have calls to the shared library function before and after __gcov_flush.
 RUN: %clang -DSHARED_CALL_BEFORE_GCOV_FLUSH -DSHARED_CALL_AFTER_GCOV_FLUSH --coverage -o %t -L%t.d -rpath %t.d -lfunc %S/Inputs/instrprof-shared-main-gcov-flush.c
 RUN: test -f instrprof-shared-main-gcov-flush.gcno
 
+RUN: rm -f instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
 RUN: %run %t
 RUN: llvm-cov gcov instrprof-shared-main-gcov-flush.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-main-gcov-flush.c.gcov %S/Inputs/instrprof-shared-main-gcov-flush_shared-call-before-after.c.gcov
 RUN: llvm-cov gcov instrprof-shared-lib.gcda
 RUN: FileCheck --match-full-lines --strict-whitespace --input-file instrprof-shared-lib.c.gcov %S/Inputs/instrprof-shared-lib_called-twice.c.gcov
-RUN: rm instrprof-shared-main-gcov-flush.gcda instrprof-shared-lib.gcda
diff --git a/test/safestack/lit.cfg b/test/safestack/lit.cfg
index 10cd8a5..f58ec45 100644
--- a/test/safestack/lit.cfg
+++ b/test/safestack/lit.cfg
@@ -18,5 +18,5 @@
 if config.lto_supported:
   config.substitutions.append((r"%clang_lto_safestack ", ' '.join(config.lto_launch + [config.clang] + config.lto_flags + ['-fsanitize=safe-stack '])))
 
-if config.host_os not in ['Linux', 'FreeBSD', 'Darwin', 'NetBSD']:
+if config.host_os not in ['Linux', 'FreeBSD', 'NetBSD']:
    config.unsupported = True
diff --git a/test/safestack/pthread-cleanup.c b/test/safestack/pthread-cleanup.c
index 805366c..55c5b42 100644
--- a/test/safestack/pthread-cleanup.c
+++ b/test/safestack/pthread-cleanup.c
@@ -1,7 +1,9 @@
 // RUN: %clang_safestack %s -pthread -o %t
-// RUN: not --crash %run %t
+// RUN: %run %t 0
+// RUN: not --crash %run %t 1
 
-// Test that unsafe stacks are deallocated correctly on thread exit.
+// Test unsafe stack deallocation. Unsafe stacks are not deallocated immediately
+// at thread exit. They are deallocated by following exiting threads.
 
 #include <stdlib.h>
 #include <string.h>
@@ -9,7 +11,7 @@
 
 enum { kBufferSize = (1 << 15) };
 
-void *t1_start(void *ptr)
+void *start(void *ptr)
 {
   char buffer[kBufferSize];
   return buffer;
@@ -17,15 +19,36 @@
 
 int main(int argc, char **argv)
 {
-  pthread_t t1;
-  char *buffer = NULL;
+  int arg = atoi(argv[1]);
 
-  if (pthread_create(&t1, NULL, t1_start, NULL))
+  pthread_t t1, t2;
+  char *t1_buffer = NULL;
+
+  if (pthread_create(&t1, NULL, start, NULL))
     abort();
-  if (pthread_join(t1, &buffer))
+  if (pthread_join(t1, &t1_buffer))
     abort();
 
-  // should segfault here
-  memset(buffer, 0, kBufferSize);
+  // Stack has not yet been deallocated
+  memset(t1_buffer, 0, kBufferSize);
+
+  if (arg == 0)
+    return 0;
+
+  for (int i = 0; i < 3; i++) {
+    if (pthread_create(&t2, NULL, start, NULL))
+      abort();
+    // Second thread destructor cleans up the first thread's stack.
+    if (pthread_join(t2, NULL))
+      abort();
+
+    // Should segfault here
+    memset(t1_buffer, 0, kBufferSize);
+
+    // PR39001: Re-try in the rare case that pthread_join() returns before the
+    // thread finishes exiting in the kernel--hence the tgkill() check for t1
+    // returns that it's alive despite pthread_join() returning.
+    sleep(1);
+  }
   return 0;
 }
diff --git a/test/safestack/pthread-stack-size.c b/test/safestack/pthread-stack-size.c
new file mode 100644
index 0000000..2fe7c68
--- /dev/null
+++ b/test/safestack/pthread-stack-size.c
@@ -0,0 +1,53 @@
+// RUN: %clang_safestack %s -pthread -o %t
+// RUN: %run %t
+
+// Test unsafe stack deallocation with custom stack sizes, in particular ensure
+// that we correctly deallocate small stacks and don't accidentally deallocate
+// adjacent memory.
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+volatile int step = 0;
+
+void *wait_until(void *ptr) {
+  while ((int)ptr != step)
+    usleep(1000);
+
+  volatile char buf[64];
+  buf[0] = 0;
+
+  return NULL;
+}
+
+int main(int argc, char **argv) {
+  pthread_t t1, t2, t3;
+
+  pthread_attr_t small_stack_attr;
+  pthread_attr_init(&small_stack_attr);
+  pthread_attr_setstacksize(&small_stack_attr, 65536);
+
+  if (pthread_create(&t3, NULL, wait_until, (void *)3))
+    abort();
+  if (pthread_create(&t1, &small_stack_attr, wait_until, (void *)1))
+    abort();
+  if (pthread_create(&t2, NULL, wait_until, (void *)2))
+    abort();
+
+  step = 1;
+  if (pthread_join(t1, NULL))
+    abort();
+
+  step = 2;
+  if (pthread_join(t2, NULL))
+    abort();
+
+  step = 3;
+  if (pthread_join(t3, NULL))
+    abort();
+
+  pthread_attr_destroy(&small_stack_attr);
+  return 0;
+}
diff --git a/test/safestack/pthread.c b/test/safestack/pthread.c
index 416586e..1687c10 100644
--- a/test/safestack/pthread.c
+++ b/test/safestack/pthread.c
@@ -1,8 +1,6 @@
 // RUN: %clang_safestack %s -pthread -o %t
 // RUN: %run %t
 
-// XFAIL: darwin
-
 // Test that pthreads receive their own unsafe stack.
 
 #include <stdlib.h>
diff --git a/test/sanitizer_common/TestCases/Posix/devname_r.cc b/test/sanitizer_common/TestCases/Posix/devname_r.cc
index 826b7c9..3a05dae 100644
--- a/test/sanitizer_common/TestCases/Posix/devname_r.cc
+++ b/test/sanitizer_common/TestCases/Posix/devname_r.cc
@@ -17,8 +17,13 @@
 
   type = S_ISCHR(st.st_mode) ? S_IFCHR : S_IFBLK;
 
+#if defined(__NetBSD__)
+  if (devname_r(st.st_rdev, type, name, sizeof(name)))
+    exit(1);
+#else
   if (!devname_r(st.st_rdev, type, name, sizeof(name)))
     exit(1);
+#endif
 
   printf("%s\n", name);
 
diff --git a/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cc b/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cc
index 0591d35..b313df8 100644
--- a/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cc
+++ b/test/sanitizer_common/TestCases/get_module_and_offset_for_pc.cc
@@ -1,6 +1,6 @@
 // RUN: mkdir -p %t-dir
 // RUN: %clangxx -DSHARED %s -shared -o %t-dir/get_module_and_offset_for_pc.so -fPIC
-// RUN: %clangxx -DSO_DIR=\"%t-dir\" -O0 %s -ldl -o %t
+// RUN: %clangxx -DSO_DIR=\"%t-dir\" -O0 %s -o %t
 // RUN: %run %t 2>&1 | FileCheck %s
 
 // UNSUPPORTED: i386-darwin
diff --git a/test/sanitizer_common/TestCases/pthread_mutexattr_get.cc b/test/sanitizer_common/TestCases/pthread_mutexattr_get.cc
index 26060f3..742ae72 100644
--- a/test/sanitizer_common/TestCases/pthread_mutexattr_get.cc
+++ b/test/sanitizer_common/TestCases/pthread_mutexattr_get.cc
@@ -1,5 +1,8 @@
 // RUN: %clangxx -O0 %s -o %t && %run %t
 
+// pthread_mutexattr_setpshared and pthread_mutexattr_getpshared unavailable
+// UNSUPPORTED: netbsd
+
 #include <assert.h>
 #include <pthread.h>
 
diff --git a/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cc b/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cc
index 28e2378..daa994c 100644
--- a/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cc
+++ b/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cc
@@ -7,7 +7,7 @@
 // RUN: rm -rf $DIR
 // RUN: mkdir -p $DIR
 // RUN: cd $DIR
-// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard %s -ldl -o %t
+// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard %s -o %t
 // RUN: %env_tool_opts=coverage=1 %t 2>&1 | FileCheck %s
 // RUN: rm -rf $DIR
 
diff --git a/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cc b/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cc
index 1adbf65..5e01a80 100644
--- a/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cc
+++ b/test/sanitizer_common/TestCases/sanitizer_coverage_trace_pc_guard.cc
@@ -9,7 +9,7 @@
 // RUN: rm -rf $DIR
 // RUN: mkdir -p $DIR
 // RUN: cd $DIR
-// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard %s -ldl -o %t
+// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard %s -o %t
 // RUN: %env_tool_opts=coverage=1 %t 2>&1 | FileCheck %s
 // RUN: %sancovcc  -covered-functions -strip_path_prefix=TestCases/ *.sancov %t 2>&1 | \
 // RUN:   FileCheck --check-prefix=CHECK-SANCOV %s
diff --git a/test/sanitizer_common/TestCases/symbolize_pc.cc b/test/sanitizer_common/TestCases/symbolize_pc.cc
index 68a6733..16a81f1 100644
--- a/test/sanitizer_common/TestCases/symbolize_pc.cc
+++ b/test/sanitizer_common/TestCases/symbolize_pc.cc
@@ -7,6 +7,17 @@
 
 int GLOBAL_VAR_ABC;
 
+void SymbolizeSmallBuffer() {
+  char data[] = "abcdef";
+  __sanitizer_symbolize_pc(__builtin_return_address(0), "%p %F %L", data, 0);
+  printf("UNCHANGED '%s'\n", data);
+  __sanitizer_symbolize_pc(__builtin_return_address(0), "%p %F %L", data, 1);
+  printf("EMPTY '%s'\n", data);
+  __sanitizer_symbolize_pc(__builtin_return_address(0), "%p %F %L", data,
+                           sizeof(data));
+  printf("PARTIAL '%s'\n", data);
+}
+
 void SymbolizeCaller() {
   char data[100];
   __sanitizer_symbolize_pc(__builtin_return_address(0), "%p %F %L", data,
@@ -31,10 +42,16 @@
   printf("GLOBAL: %s\n", data);
 }
 
-// CHECK: FIRST_FORMAT 0x{{.*}} in main symbolize_pc.cc:[[@LINE+3]]
-// CHECK: SECOND_FORMAT FUNC:main LINE:[[@LINE+2]] FILE:symbolize_pc.cc
 int main() {
+  // CHECK: UNCHANGED 'abcdef'
+  // CHECK: EMPTY ''
+  // CHECK: PARTIAL '0x{{.*}}'
+  SymbolizeSmallBuffer();
+
+  // CHECK: FIRST_FORMAT 0x{{.*}} in main symbolize_pc.cc:[[@LINE+2]]
+  // CHECK: SECOND_FORMAT FUNC:main LINE:[[@LINE+1]] FILE:symbolize_pc.cc
   SymbolizeCaller();
+
+  // CHECK: GLOBAL: GLOBAL_VAR_ABC
   SymbolizeData();
 }
-// CHECK: GLOBAL: GLOBAL_VAR_ABC
diff --git a/test/sanitizer_common/TestCases/symbolize_pc_inline.cc b/test/sanitizer_common/TestCases/symbolize_pc_inline.cc
new file mode 100644
index 0000000..9589a3a
--- /dev/null
+++ b/test/sanitizer_common/TestCases/symbolize_pc_inline.cc
@@ -0,0 +1,32 @@
+// RUN: %clangxx -O3  %s -o %t
+// RUN: %env_tool_opts=strip_path_prefix=/TestCases/ %run %t 2>&1 | FileCheck %s
+// RUN: %env_tool_opts=strip_path_prefix=/TestCases/:symbolize_inline_frames=0 \
+// RUN:   %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NOINLINE
+
+// XFAIL: darwin
+
+#include <sanitizer/common_interface_defs.h>
+#include <stdio.h>
+#include <string.h>
+
+char buffer[10000];
+
+__attribute__((noinline)) static void Symbolize() {
+  __sanitizer_symbolize_pc(__builtin_return_address(0), "%p %F %L", buffer,
+                           sizeof(buffer));
+  for (char *p = buffer; strlen(p); p += strlen(p) + 1)
+    printf("%s\n", p);
+}
+
+// CHECK-NOINLINE: {{0x[0-9a-f]+}} in main symbolize_pc_inline.cc:[[@LINE+2]]
+// CHECK: [[ADDR:0x[0-9a-f]+]] in C2 symbolize_pc_inline.cc:[[@LINE+1]]
+static inline void C2() { Symbolize(); }
+
+// CHECK: [[ADDR]] in C3 symbolize_pc_inline.cc:[[@LINE+1]]
+static inline void C3() { C2(); }
+
+// CHECK: [[ADDR]] in C4 symbolize_pc_inline.cc:[[@LINE+1]]
+static inline void C4() { C3(); }
+
+// CHECK: [[ADDR]] in main symbolize_pc_inline.cc:[[@LINE+1]]
+int main() { C4(); }
diff --git a/test/sanitizer_common/ios_commands/iossim_run.py b/test/sanitizer_common/ios_commands/iossim_run.py
index 47b847f..0d9ca93 100755
--- a/test/sanitizer_common/ios_commands/iossim_run.py
+++ b/test/sanitizer_common/ios_commands/iossim_run.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 
-import os, sys, subprocess
+import glob, os, pipes, sys, subprocess
 
 
 if not "SANITIZER_IOSSIM_TEST_DEVICE_IDENTIFIER" in os.environ:
@@ -8,11 +8,29 @@
 
 device_id = os.environ["SANITIZER_IOSSIM_TEST_DEVICE_IDENTIFIER"]
 
-for e in ["ASAN_OPTIONS", "TSAN_OPTIONS"]:
+for e in ["ASAN_OPTIONS", "TSAN_OPTIONS", "UBSAN_OPTIONS"]:
   if e in os.environ:
     os.environ["SIMCTL_CHILD_" + e] = os.environ[e]
 
-exitcode = subprocess.call(["xcrun", "simctl", "spawn", device_id] + sys.argv[1:])
+prog = sys.argv[1]
+exit_code = None
+if prog == 'rm':
+  # The simulator and host actually share the same file system so we can just
+  # execute directly on the host.
+  rm_args = []
+  for arg in sys.argv[2:]:
+    if '*' in arg or '?' in arg:
+      # Don't quote glob pattern
+      rm_args.append(arg)
+    else:
+      # FIXME(dliew): pipes.quote() is deprecated
+      rm_args.append(pipes.quote(arg))
+  rm_cmd_line = ["/bin/rm"] + rm_args
+  rm_cmd_line_str = ' '.join(rm_cmd_line)
+  # We use `shell=True` so that any wildcard globs get expanded by the shell.
+  exitcode = subprocess.call(rm_cmd_line_str, shell=True)
+else:
+  exitcode = subprocess.call(["xcrun", "simctl", "spawn", device_id] + sys.argv[1:])
 if exitcode > 125:
   exitcode = 126
 sys.exit(exitcode)
diff --git a/test/sanitizer_common/lit.common.cfg b/test/sanitizer_common/lit.common.cfg
index 1b347bf..75cc7b7 100644
--- a/test/sanitizer_common/lit.common.cfg
+++ b/test/sanitizer_common/lit.common.cfg
@@ -46,7 +46,13 @@
   config.environment[tool_options] = default_tool_options_str
   default_tool_options_str += ':'
 
+if config.host_os in ['Linux']:
+  extra_link_flags = ["-ldl"]
+else:
+  extra_link_flags = []
+
 clang_cflags = config.debug_info_flags + tool_cflags + [config.target_cflags]
+clang_cflags += extra_link_flags
 clang_cxxflags = config.cxx_mode_flags + clang_cflags
 
 def build_invocation(compile_flags):
diff --git a/test/scudo/dealloc-race.c b/test/scudo/dealloc-race.c
new file mode 100644
index 0000000..326e22f
--- /dev/null
+++ b/test/scudo/dealloc-race.c
@@ -0,0 +1,69 @@
+// RUN: %clang_scudo %s -O2 -o %t
+// RUN: %env_scudo_opts="QuarantineChunksUpToSize=0" %run %t 2>&1
+
+// This test attempts to reproduce a race condition in the deallocation path
+// when bypassing the Quarantine. The old behavior was to zero-out the chunk
+// header after checking its checksum, state & various other things, but that
+// left a window during which 2 (or more) threads could deallocate the same
+// chunk, with a net result of having said chunk present in those distinct
+// thread caches.
+
+// A passing test means all the children died with an error. The failing
+// scenario involves winning a race, so repro can be scarce.
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+const int kNumThreads = 2;
+pthread_t tid[kNumThreads];
+
+pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+char go = 0;
+
+// Frees the pointer passed when signaled to.
+void *thread_free(void *p) {
+  pthread_mutex_lock(&mutex);
+  while (!go)
+    pthread_cond_wait(&cond, &mutex);
+  pthread_mutex_unlock(&mutex);
+  free(p);
+  return 0;
+}
+
+// Allocates a chunk, and attempts to free it "simultaneously" by 2 threads.
+void child(void) {
+  void *p = malloc(16);
+  for (int i = 0; i < kNumThreads; i++)
+    pthread_create(&tid[i], 0, thread_free, p);
+  pthread_mutex_lock(&mutex);
+  go = 1;
+  pthread_cond_broadcast(&cond);
+  pthread_mutex_unlock(&mutex);
+  for (int i = 0; i < kNumThreads; i++)
+    pthread_join(tid[i], 0);
+}
+
+int main(int argc, char** argv) {
+  const int kChildren = 40;
+  pid_t pid;
+  for (int i = 0; i < kChildren; ++i) {
+    pid = fork();
+    if (pid < 0) {
+      exit(1);
+    } else if (pid == 0) {
+      child();
+      exit(0);
+    } else {
+      int status;
+      wait(&status);
+      // A 0 status means the child didn't die with an error. The race was won.
+      if (status == 0)
+        exit(1);
+    }
+  }
+  return 0;
+}
diff --git a/test/tsan/Darwin/gcd-sync-block-copy.mm b/test/tsan/Darwin/gcd-sync-block-copy.mm
new file mode 100644
index 0000000..87658d7
--- /dev/null
+++ b/test/tsan/Darwin/gcd-sync-block-copy.mm
@@ -0,0 +1,34 @@
+// This test verifies that dispatch_sync() doesn't actually copy the block under TSan (without TSan, it doesn't).
+
+// RUN: %clang_tsan -fno-sanitize=thread %s -o %t_no_tsan -framework Foundation
+// RUN: %run %t_no_tsan 2>&1 | FileCheck %s
+
+// RUN: %clang_tsan %s -o %t_with_tsan -framework Foundation
+// RUN: %run %t_with_tsan 2>&1 | FileCheck %s
+
+#import <Foundation/Foundation.h>
+
+@interface MyClass : NSObject
+@end
+
+@implementation MyClass
+- (instancetype)retain {
+  // Copying the dispatch_sync'd block below will increment the retain count of
+  // this object. Abort if that happens.
+  abort();
+}
+@end
+
+int main(int argc, const char* argv[]) {
+  dispatch_queue_t q = dispatch_queue_create("my.queue", NULL);
+  id object = [[MyClass alloc] init];
+  dispatch_sync(q, ^{
+    NSLog(@"%@", object);
+  });
+  [object release];
+  NSLog(@"Done.");
+  return 0;
+}
+
+// CHECK: Done.
+// CHECK-NOT: WARNING: ThreadSanitizer
diff --git a/test/tsan/large_malloc_meta.cc b/test/tsan/large_malloc_meta.cc
index e830048..0d0eec3 100644
--- a/test/tsan/large_malloc_meta.cc
+++ b/test/tsan/large_malloc_meta.cc
@@ -1,4 +1,7 @@
 // RUN: %clangxx_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+
+// UNSUPPORTED: ios
+
 #include "test.h"
 #include <sys/mman.h>
 
diff --git a/test/tsan/mmap_large.cc b/test/tsan/mmap_large.cc
index 764e954..c8d258e 100644
--- a/test/tsan/mmap_large.cc
+++ b/test/tsan/mmap_large.cc
@@ -1,4 +1,7 @@
 // RUN: %clang_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+
+// UNSUPPORTED: ios
+
 #include <stdint.h>
 #include <stdio.h>
 #include <errno.h>
diff --git a/test/ubsan/CMakeLists.txt b/test/ubsan/CMakeLists.txt
index fa8b16b..351bf3b 100644
--- a/test/ubsan/CMakeLists.txt
+++ b/test/ubsan/CMakeLists.txt
@@ -62,6 +62,31 @@
   endif()
 endforeach()
 
+macro(add_ubsan_device_testsuite test_mode sanitizer platform arch)
+  # Note we expect the caller to have already set UBSAN_TEST_TARGET_CFLAGS
+  set(UBSAN_LIT_TEST_MODE "${test_mode}")
+  set(CONFIG_NAME ${UBSAN_LIT_TEST_MODE}-${platform}-${arch})
+  set(UBSAN_TEST_TARGET_ARCH ${arch})
+  set(UBSAN_TEST_USE_LLD "False")
+  set(UBSAN_TEST_USE_THINLTO "False")
+  if (APPLE)
+    set(UBSAN_TEST_APPLE_PLATFORM "${platform}")
+  else()
+    unset(UBSAN_TEST_APPLE_PLATFORM)
+  endif()
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg)
+  #list(APPEND UBSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+  if(NOT COMPILER_RT_STANDALONE_BUILD)
+    list(APPEND UBSAN_TEST_DEPS ${sanitizer})
+  endif()
+  add_lit_testsuite(check-ubsan-${test_mode}-${platform}-${arch}
+    "UBSan ${CONFIG_NAME} tests"
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/
+    DEPENDS ${UBSAN_TEST_DEPS})
+endmacro()
+
 if(APPLE)
   foreach(arch ${UBSAN_TEST_ARCH})
     set(UBSAN_TEST_TARGET_ARCH ${arch})
@@ -69,9 +94,47 @@
     set(UBSAN_TEST_TARGET_CFLAGS "${UBSAN_TEST_TARGET_CFLAGS} -lc++abi")
     add_ubsan_testsuites("StandaloneStatic" ubsan ${arch})
   endforeach()
+
+  # Device and simulator test suites.
+  # These are not added into "check-all", in order to run these tests, use
+  # "check-asan-iossim-x86_64" and similar. They also require that an extra env
+  # variable to select which iOS device or simulator to use, e.g.:
+  # SANITIZER_IOSSIM_TEST_DEVICE_IDENTIFIER="iPhone 6"
+  set(EXCLUDE_FROM_ALL ON)
+  set(UBSAN_APPLE_PLATFORMS "")
+  if (COMPILER_RT_ENABLE_IOS)
+    list(APPEND UBSAN_APPLE_PLATFORMS ios iossim)
+  endif()
+  if (COMPILER_RT_ENABLE_WATCHOS)
+    list(APPEND UBSAN_APPLE_PLATFORMS watchos watchossim)
+  endif()
+  if (COMPILER_RT_ENABLE_TVOS)
+    list(APPEND UBSAN_APPLE_PLATFORMS tvos tvossim)
+  endif()
+  foreach(platform ${UBSAN_APPLE_PLATFORMS})
+    foreach(arch ${DARWIN_${platform}_ARCHS})
+      get_target_flags_for_arch(${arch} UBSAN_TEST_TARGET_ARCH_FLAGS_AS_LIST)
+      string(REPLACE ";" " " UBSAN_TEST_TARGET_ARCH_FLAGS "${UBSAN_TEST_TARGET_ARCH_FLAGS_AS_LIST}")
+      set(UBSAN_TEST_TARGET_CFLAGS
+        "${UBSAN_TEST_TARGET_ARCH_FLAGS} -isysroot ${DARWIN_${platform}_SYSROOT}")
+      if (";${UBSAN_SUPPORTED_ARCH};" MATCHES ";${arch};")
+        add_ubsan_device_testsuite("Standalone" ubsan ${platform} ${arch})
+      endif()
+
+      if(COMPILER_RT_HAS_ASAN AND ";${ASAN_SUPPORTED_ARCH};" MATCHES ";${arch};")
+        add_ubsan_device_testsuite("AddressSanitizer" asan ${platform} ${arch})
+      endif()
+
+      if(COMPILER_RT_HAS_TSAN AND ";${TSAN_SUPPORTED_ARCH};" MATCHES ";${arch};")
+        add_ubsan_device_testsuite("ThreadSanitizer" tsan ${platform} ${arch})
+      endif()
+    endforeach()
+  endforeach()
+  set(EXCLUDE_FROM_ALL OFF)
 endif()
 
 add_lit_testsuite(check-ubsan "Running UndefinedBehaviorSanitizer tests"
   ${UBSAN_TESTSUITES}
   DEPENDS ${UBSAN_TEST_DEPS})
 set_target_properties(check-ubsan PROPERTIES FOLDER "Compiler-RT Misc")
+
diff --git a/test/ubsan/TestCases/ImplicitConversion/integer-conversion.c b/test/ubsan/TestCases/ImplicitConversion/integer-conversion.c
new file mode 100644
index 0000000..d49d043
--- /dev/null
+++ b/test/ubsan/TestCases/ImplicitConversion/integer-conversion.c
@@ -0,0 +1,324 @@
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV0 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V0
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV1 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V1
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV2 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V2
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV3 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V3
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV4 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V4
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV5 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V5
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV6 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V6
+// RUN: %clang   -x c   -fsanitize=implicit-conversion %s -DV7 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V7
+
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV0 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V0
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV1 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V1
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV2 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V2
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV3 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V3
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV4 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V4
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV5 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V5
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV6 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V6
+// RUN: %clang   -x c++ -fsanitize=implicit-conversion %s -DV7 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V7
+
+#include <stdint.h>
+
+// Test plan:
+//  * Two types - int and char
+//  * Two signs - signed and unsigned
+//  * Square that - we have input and output types.
+// Thus, there are total of (2*2)^2 == 16 tests.
+// These are all the possible variations/combinations of casts.
+// However, not all of them should result in the check.
+// So here, we *only* check which should and which should not result in checks.
+
+unsigned int convert_unsigned_int_to_unsigned_int(unsigned int x) {
+#line 100
+  return x;
+}
+
+unsigned char convert_unsigned_char_to_unsigned_char(unsigned char x) {
+#line 200
+  return x;
+}
+
+signed int convert_signed_int_to_signed_int(signed int x) {
+#line 300
+  return x;
+}
+
+signed char convert_signed_char_to_signed_char(signed char x) {
+#line 400
+  return x;
+}
+
+unsigned char convert_unsigned_int_to_unsigned_char(unsigned int x) {
+#line 500
+  return x;
+}
+
+unsigned int convert_unsigned_char_to_unsigned_int(unsigned char x) {
+#line 600
+  return x;
+}
+
+signed int convert_unsigned_char_to_signed_int(unsigned char x) {
+#line 700
+  return x;
+}
+
+signed int convert_signed_char_to_signed_int(signed char x) {
+#line 800
+  return x;
+}
+
+signed int convert_unsigned_int_to_signed_int(unsigned int x) {
+#line 900
+  return x;
+}
+
+unsigned int convert_signed_int_to_unsigned_int(signed int x) {
+#line 1000
+  return x;
+}
+
+unsigned char convert_signed_int_to_unsigned_char(signed int x) {
+#line 1100
+  return x;
+}
+
+unsigned char convert_signed_char_to_unsigned_char(signed char x) {
+#line 1200
+  return x;
+}
+
+signed char convert_unsigned_char_to_signed_char(unsigned char x) {
+#line 1300
+  return x;
+}
+
+unsigned int convert_signed_char_to_unsigned_int(signed char x) {
+#line 1400
+  return x;
+}
+
+signed char convert_unsigned_int_to_signed_char(unsigned int x) {
+#line 1500
+  return x;
+}
+
+signed char convert_signed_int_to_signed_char(signed int x) {
+#line 1600
+  return x;
+}
+
+#line 1111 // !!!
+
+int main() {
+  // No bits set.
+  convert_unsigned_int_to_unsigned_int(0);
+  convert_unsigned_char_to_unsigned_char(0);
+  convert_signed_int_to_signed_int(0);
+  convert_signed_char_to_signed_char(0);
+  convert_unsigned_int_to_unsigned_char(0);
+  convert_unsigned_char_to_unsigned_int(0);
+  convert_unsigned_char_to_signed_int(0);
+  convert_signed_char_to_signed_int(0);
+  convert_unsigned_int_to_signed_int(0);
+  convert_signed_int_to_unsigned_int(0);
+  convert_signed_int_to_unsigned_char(0);
+  convert_signed_char_to_unsigned_char(0);
+  convert_unsigned_char_to_signed_char(0);
+  convert_signed_char_to_unsigned_int(0);
+  convert_unsigned_int_to_signed_char(0);
+  convert_signed_int_to_signed_char(0);
+
+  // One lowest bit set.
+  convert_unsigned_int_to_unsigned_int(1);
+  convert_unsigned_char_to_unsigned_char(1);
+  convert_signed_int_to_signed_int(1);
+  convert_signed_char_to_signed_char(1);
+  convert_unsigned_int_to_unsigned_char(1);
+  convert_unsigned_char_to_unsigned_int(1);
+  convert_unsigned_char_to_signed_int(1);
+  convert_signed_char_to_signed_int(1);
+  convert_unsigned_int_to_signed_int(1);
+  convert_signed_int_to_unsigned_int(1);
+  convert_signed_int_to_unsigned_char(1);
+  convert_signed_char_to_unsigned_char(1);
+  convert_unsigned_char_to_signed_char(1);
+  convert_signed_char_to_unsigned_int(1);
+  convert_unsigned_int_to_signed_char(1);
+  convert_signed_int_to_signed_char(1);
+
+#if defined(V0)
+  // All source bits set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)UINT32_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)UINT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)UINT32_MAX);
+  convert_signed_char_to_signed_char((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_unsigned_char((uint32_t)UINT32_MAX);
+// CHECK-V0: {{.*}}integer-conversion.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'unsigned char' changed the value to 255 (8-bit, unsigned
+  convert_unsigned_char_to_unsigned_int((uint8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)UINT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)UINT32_MAX);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)UINT32_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)UINT32_MAX);
+// CHECK-V0: {{.*}}integer-conversion.c:1100:10: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)UINT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_char((uint32_t)UINT32_MAX);
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)UINT32_MAX);
+#elif defined(V1)
+   // Source 'Sign' bit set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MIN);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)INT32_MIN);
+  convert_signed_char_to_signed_char((int8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char((uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-conversion.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483648 (32-bit, unsigned) to type 'unsigned char' changed the value to 0 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-conversion.c:1100:10: runtime error: implicit conversion from type 'int' of value -2147483648 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-conversion.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483648 (32-bit, unsigned) to type 'signed char' changed the value to 0 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-conversion.c:1600:10: runtime error: implicit conversion from type 'int' of value -2147483648 (32-bit, signed) to type 'signed char' changed the value to 0 (8-bit, signed)
+#elif defined(V2)
+  // All bits except the source 'Sign' bit are set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)INT32_MAX);
+  convert_signed_char_to_signed_char((int8_t)INT8_MAX);
+  convert_unsigned_int_to_unsigned_char((uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-conversion.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483647 (32-bit, unsigned) to type 'unsigned char' changed the value to 255 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-conversion.c:1100:10: runtime error: implicit conversion from type 'int' of value 2147483647 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_char((uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-conversion.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483647 (32-bit, unsigned) to type 'signed char' changed the value to -1 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-conversion.c:1600:10: runtime error: implicit conversion from type 'int' of value 2147483647 (32-bit, signed) to type 'signed char' changed the value to -1 (8-bit, signed)
+#elif defined(V3)
+  // All destination bits set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)UINT8_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)UINT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)UINT8_MAX);
+  convert_signed_char_to_signed_char((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_unsigned_char((uint32_t)UINT8_MAX);
+  convert_unsigned_char_to_unsigned_int((uint8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)UINT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)UINT8_MAX);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)UINT8_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)UINT8_MAX);
+  convert_signed_char_to_unsigned_char((int8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)UINT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_char((uint32_t)UINT8_MAX);
+// CHECK-V3: {{.*}}integer-conversion.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 255 (32-bit, unsigned) to type 'signed char' changed the value to -1 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)UINT8_MAX);
+// CHECK-V3: {{.*}}integer-conversion.c:1600:10: runtime error: implicit conversion from type 'int' of value 255 (32-bit, signed) to type 'signed char' changed the value to -1 (8-bit, signed)
+#elif defined(V4)
+  // Destination 'sign' bit set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_unsigned_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char((uint32_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_unsigned_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((uint32_t)(uint8_t)INT8_MIN);
+// CHECK-V4: {{.*}}integer-conversion.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 128 (32-bit, unsigned) to type 'signed char' changed the value to -128 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+// CHECK-V4: {{.*}}integer-conversion.c:1600:10: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed)
+#elif defined(V5)
+  // All bits except the destination 'sign' bit are set.
+  convert_unsigned_int_to_unsigned_int((~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_unsigned_char_to_unsigned_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_signed_char_to_signed_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char((~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-conversion.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 4294967167 (32-bit, unsigned) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_signed_int_to_unsigned_int((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_signed_int_to_unsigned_char((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-conversion.c:1100:10: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-conversion.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 4294967167 (32-bit, unsigned) to type 'signed char' changed the value to 127 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-conversion.c:1600:10: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed)
+#elif defined(V6)
+  // Only the source and destination sign bits are set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MIN);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)INT32_MIN);
+  convert_signed_char_to_signed_char((int8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN));
+// CHECK-V6: {{.*}}integer-conversion.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483776 (32-bit, unsigned) to type 'unsigned char' changed the value to 128 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_int((uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_char((int32_t)(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V6: {{.*}}integer-conversion.c:1100:10: runtime error: implicit conversion from type 'int' of value -2147483520 (32-bit, signed) to type 'unsigned char' changed the value to 128 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V6: {{.*}}integer-conversion.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483776 (32-bit, unsigned) to type 'signed char' changed the value to -128 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V6: {{.*}}integer-conversion.c:1600:10: runtime error: implicit conversion from type 'int' of value -2147483520 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed)
+#elif defined(V7)
+  // All bits except the source and destination sign bits are set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)INT32_MAX);
+  convert_signed_char_to_signed_char((int8_t)INT8_MAX);
+  convert_unsigned_int_to_unsigned_char(~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V7: {{.*}}integer-conversion.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483519 (32-bit, unsigned) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_int((uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN))));
+// CHECK-V7: {{.*}}integer-conversion.c:1100:10: runtime error: implicit conversion from type 'int' of value 2147483519 (32-bit, signed) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_char(~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V7: {{.*}}integer-conversion.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483519 (32-bit, unsigned) to type 'signed char' changed the value to 127 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V7: {{.*}}integer-conversion.c:1600:10: runtime error: implicit conversion from type 'int' of value 2147483519 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed)
+#else
+#error Some V* needs to be defined!
+#endif
+
+  return 0;
+}
diff --git a/test/ubsan/TestCases/ImplicitConversion/integer-truncation-blacklist.c b/test/ubsan/TestCases/ImplicitConversion/integer-truncation-blacklist.c
index 13d4dca..a3650e0 100644
--- a/test/ubsan/TestCases/ImplicitConversion/integer-truncation-blacklist.c
+++ b/test/ubsan/TestCases/ImplicitConversion/integer-truncation-blacklist.c
@@ -3,16 +3,22 @@
 // XFAIL: android
 // UNSUPPORTED: ios
 
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation                           -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion"
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation                           -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion"
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation                           -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion"
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation                           -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion"
+
 // RUN: rm -f %tmp
 // RUN: echo "[implicit-integer-truncation]" >> %tmp
-// RUN: echo "fun:*implicitTruncation*" >> %tmp
-// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O0 %s -o %t && not %run %t 2>&1
-// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O1 %s -o %t && not %run %t 2>&1
-// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O2 %s -o %t && not %run %t 2>&1
-// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O3 %s -o %t && not %run %t 2>&1
+// RUN: echo "fun:implicitTruncation" >> %tmp
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O0 %s -o %t && not %run %t 2>&1 | not FileCheck %s
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O1 %s -o %t && not %run %t 2>&1 | not FileCheck %s
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O2 %s -o %t && not %run %t 2>&1 | not FileCheck %s
+// RUN: %clang -fsanitize=implicit-integer-truncation -fno-sanitize-recover=implicit-integer-truncation -fsanitize-blacklist=%tmp -O3 %s -o %t && not %run %t 2>&1 | not FileCheck %s
 
 unsigned char implicitTruncation(unsigned int argc) {
   return argc; // BOOM
+// CHECK: {{.*}}integer-truncation-blacklist.c:[[@LINE-1]]:10: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'unsigned char' changed the value to 255 (8-bit, unsigned)
 }
 
 int main(int argc, char **argv) {
diff --git a/test/ubsan/TestCases/ImplicitConversion/integer-truncation.c b/test/ubsan/TestCases/ImplicitConversion/integer-truncation.c
index 995eb7d..34a0f00 100644
--- a/test/ubsan/TestCases/ImplicitConversion/integer-truncation.c
+++ b/test/ubsan/TestCases/ImplicitConversion/integer-truncation.c
@@ -1,63 +1,324 @@
-// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -o %t && %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK
-// RUN: %clangxx -x c++ -fsanitize=implicit-integer-truncation %s -o %t && %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV0 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V0
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV1 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V1
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV2 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V2
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV3 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V3
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV4 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V4
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV5 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V5
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV6 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V6
+// RUN: %clang   -x c   -fsanitize=implicit-integer-truncation %s -DV7 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V7
+
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV0 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V0
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV1 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V1
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV2 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V2
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV3 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V3
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV4 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V4
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV5 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V5
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV6 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V6
+// RUN: %clang   -x c++ -fsanitize=implicit-integer-truncation %s -DV7 -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK-V7
 
 #include <stdint.h>
 
-#if !defined(__cplusplus)
-#define bool _Bool
-#endif
+// Test plan:
+//  * Two types - int and char
+//  * Two signs - signed and unsigned
+//  * Square that - we have input and output types.
+// Thus, there are total of (2*2)^2 == 16 tests.
+// These are all the possible variations/combinations of casts.
+// However, not all of them should result in the check.
+// So here, we *only* check which should and which should not result in checks.
+
+unsigned int convert_unsigned_int_to_unsigned_int(unsigned int x) {
+#line 100
+  return x;
+}
+
+unsigned char convert_unsigned_char_to_unsigned_char(unsigned char x) {
+#line 200
+  return x;
+}
+
+signed int convert_signed_int_to_signed_int(signed int x) {
+#line 300
+  return x;
+}
+
+signed char convert_signed_char_to_signed_char(signed char x) {
+#line 400
+  return x;
+}
+
+unsigned char convert_unsigned_int_to_unsigned_char(unsigned int x) {
+#line 500
+  return x;
+}
+
+unsigned int convert_unsigned_char_to_unsigned_int(unsigned char x) {
+#line 600
+  return x;
+}
+
+signed int convert_unsigned_char_to_signed_int(unsigned char x) {
+#line 700
+  return x;
+}
+
+signed int convert_signed_char_to_signed_int(signed char x) {
+#line 800
+  return x;
+}
+
+signed int convert_unsigned_int_to_signed_int(unsigned int x) {
+#line 900
+  return x;
+}
+
+unsigned int convert_signed_int_to_unsigned_int(signed int x) {
+#line 1000
+  return x;
+}
+
+unsigned char convert_signed_int_to_unsigned_char(signed int x) {
+#line 1100
+  return x;
+}
+
+unsigned char convert_signed_char_to_unsigned_char(signed char x) {
+#line 1200
+  return x;
+}
+
+signed char convert_unsigned_char_to_signed_char(unsigned char x) {
+#line 1300
+  return x;
+}
+
+unsigned int convert_signed_char_to_unsigned_int(signed char x) {
+#line 1400
+  return x;
+}
+
+signed char convert_unsigned_int_to_signed_char(unsigned int x) {
+#line 1500
+  return x;
+}
+
+signed char convert_signed_int_to_signed_char(signed int x) {
+#line 1600
+  return x;
+}
+
+#line 1111 // !!!
 
 int main() {
-// CHECK-NOT: integer-truncation.c
+  // No bits set.
+  convert_unsigned_int_to_unsigned_int(0);
+  convert_unsigned_char_to_unsigned_char(0);
+  convert_signed_int_to_signed_int(0);
+  convert_signed_char_to_signed_char(0);
+  convert_unsigned_int_to_unsigned_char(0);
+  convert_unsigned_char_to_unsigned_int(0);
+  convert_unsigned_char_to_signed_int(0);
+  convert_signed_char_to_signed_int(0);
+  convert_unsigned_int_to_signed_int(0);
+  convert_signed_int_to_unsigned_int(0);
+  convert_signed_int_to_unsigned_char(0);
+  convert_signed_char_to_unsigned_char(0);
+  convert_unsigned_char_to_signed_char(0);
+  convert_signed_char_to_unsigned_int(0);
+  convert_unsigned_int_to_signed_char(0);
+  convert_signed_int_to_signed_char(0);
 
-  // Negative tests. Even if they produce unexpected results, this sanitizer does not care.
-  int8_t n0 = (~((uint32_t)0)); // ~0 -> -1, but do not warn.
-  uint8_t n2 = 128;
-  uint8_t n3 = 255;
-  // Bools do not count
-  bool b0 = (~((uint32_t)0));
-  bool b1 = 255;
+  // One lowest bit set.
+  convert_unsigned_int_to_unsigned_int(1);
+  convert_unsigned_char_to_unsigned_char(1);
+  convert_signed_int_to_signed_int(1);
+  convert_signed_char_to_signed_char(1);
+  convert_unsigned_int_to_unsigned_char(1);
+  convert_unsigned_char_to_unsigned_int(1);
+  convert_unsigned_char_to_signed_int(1);
+  convert_signed_char_to_signed_int(1);
+  convert_unsigned_int_to_signed_int(1);
+  convert_signed_int_to_unsigned_int(1);
+  convert_signed_int_to_unsigned_char(1);
+  convert_signed_char_to_unsigned_char(1);
+  convert_unsigned_char_to_signed_char(1);
+  convert_signed_char_to_unsigned_int(1);
+  convert_unsigned_int_to_signed_char(1);
+  convert_signed_int_to_signed_char(1);
 
-  // Explicit and-ing of bits will silence it.
-  uint8_t nc0 = (~((uint32_t)0)) & 255;
-
-  // Explicit casts
-  uint8_t i0 = (uint8_t)(~((uint32_t)0));
-
-#if defined(__cplusplus)
-  uint8_t i1 = uint8_t(~(uint32_t(0)));
-  uint8_t i2 = static_cast<uint8_t>(~(uint32_t(0)));
+#if defined(V0)
+  // All source bits set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)UINT32_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)UINT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)UINT32_MAX);
+  convert_signed_char_to_signed_char((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_unsigned_char((uint32_t)UINT32_MAX);
+// CHECK-V0: {{.*}}integer-truncation.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 4294967295 (32-bit, unsigned) to type 'unsigned char' changed the value to 255 (8-bit, unsigned
+  convert_unsigned_char_to_unsigned_int((uint8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)UINT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)UINT32_MAX);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)UINT32_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)UINT32_MAX);
+// CHECK-V0: {{.*}}integer-truncation.c:1100:10: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)UINT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_char((uint32_t)UINT32_MAX);
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)UINT32_MAX);
+#elif defined(V1)
+   // Source 'Sign' bit set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MIN);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)INT32_MIN);
+  convert_signed_char_to_signed_char((int8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char((uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-truncation.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483648 (32-bit, unsigned) to type 'unsigned char' changed the value to 0 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-truncation.c:1100:10: runtime error: implicit conversion from type 'int' of value -2147483648 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-truncation.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483648 (32-bit, unsigned) to type 'signed char' changed the value to 0 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)INT32_MIN);
+// CHECK-V1: {{.*}}integer-truncation.c:1600:10: runtime error: implicit conversion from type 'int' of value -2147483648 (32-bit, signed) to type 'signed char' changed the value to 0 (8-bit, signed)
+#elif defined(V2)
+  // All bits except the source 'Sign' bit are set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)INT32_MAX);
+  convert_signed_char_to_signed_char((int8_t)INT8_MAX);
+  convert_unsigned_int_to_unsigned_char((uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-truncation.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483647 (32-bit, unsigned) to type 'unsigned char' changed the value to 255 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-truncation.c:1100:10: runtime error: implicit conversion from type 'int' of value 2147483647 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_char((uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-truncation.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483647 (32-bit, unsigned) to type 'signed char' changed the value to -1 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)INT32_MAX);
+// CHECK-V2: {{.*}}integer-truncation.c:1600:10: runtime error: implicit conversion from type 'int' of value 2147483647 (32-bit, signed) to type 'signed char' changed the value to -1 (8-bit, signed)
+#elif defined(V3)
+  // All destination bits set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)UINT8_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)UINT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)UINT8_MAX);
+  convert_signed_char_to_signed_char((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_unsigned_char((uint32_t)UINT8_MAX);
+  convert_unsigned_char_to_unsigned_int((uint8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)UINT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)UINT8_MAX);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)UINT8_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)UINT8_MAX);
+  convert_signed_char_to_unsigned_char((int8_t)UINT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)UINT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)UINT8_MAX);
+  convert_unsigned_int_to_signed_char((uint32_t)UINT8_MAX);
+// CHECK-V3: {{.*}}integer-truncation.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 255 (32-bit, unsigned) to type 'signed char' changed the value to -1 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)UINT8_MAX);
+// CHECK-V3: {{.*}}integer-truncation.c:1600:10: runtime error: implicit conversion from type 'int' of value 255 (32-bit, signed) to type 'signed char' changed the value to -1 (8-bit, signed)
+#elif defined(V4)
+  // Destination 'sign' bit set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_unsigned_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char((uint32_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_unsigned_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_unsigned_int((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_unsigned_char((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((uint32_t)(uint8_t)INT8_MIN);
+// CHECK-V4: {{.*}}integer-truncation.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 128 (32-bit, unsigned) to type 'signed char' changed the value to -128 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(uint32_t)(uint8_t)INT8_MIN);
+// CHECK-V4: {{.*}}integer-truncation.c:1600:10: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed)
+#elif defined(V5)
+  // All bits except the destination 'sign' bit are set.
+  convert_unsigned_int_to_unsigned_int((~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_unsigned_char_to_unsigned_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_signed_char_to_signed_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char((~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-truncation.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 4294967167 (32-bit, unsigned) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_signed_int_to_unsigned_int((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+  convert_signed_int_to_unsigned_char((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-truncation.c:1100:10: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)(uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)(uint8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-truncation.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 4294967167 (32-bit, unsigned) to type 'signed char' changed the value to 127 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(~((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V5: {{.*}}integer-truncation.c:1600:10: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed)
+#elif defined(V6)
+  // Only the source and destination sign bits are set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MIN);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MIN);
+  convert_signed_int_to_signed_int((int32_t)INT32_MIN);
+  convert_signed_char_to_signed_char((int8_t)INT8_MIN);
+  convert_unsigned_int_to_unsigned_char(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN));
+// CHECK-V6: {{.*}}integer-truncation.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483776 (32-bit, unsigned) to type 'unsigned char' changed the value to 128 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MIN);
+  convert_signed_char_to_signed_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_int((uint32_t)INT32_MIN);
+  convert_signed_int_to_unsigned_char((int32_t)(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V6: {{.*}}integer-truncation.c:1100:10: runtime error: implicit conversion from type 'int' of value -2147483520 (32-bit, signed) to type 'unsigned char' changed the value to 128 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MIN);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MIN);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MIN);
+  convert_unsigned_int_to_signed_char((((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V6: {{.*}}integer-truncation.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483776 (32-bit, unsigned) to type 'signed char' changed the value to -128 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V6: {{.*}}integer-truncation.c:1600:10: runtime error: implicit conversion from type 'int' of value -2147483520 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed)
+#elif defined(V7)
+  // All bits except the source and destination sign bits are set.
+  convert_unsigned_int_to_unsigned_int((uint32_t)INT32_MAX);
+  convert_unsigned_char_to_unsigned_char((uint8_t)INT8_MAX);
+  convert_signed_int_to_signed_int((int32_t)INT32_MAX);
+  convert_signed_char_to_signed_char((int8_t)INT8_MAX);
+  convert_unsigned_int_to_unsigned_char(~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V7: {{.*}}integer-truncation.c:500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483519 (32-bit, unsigned) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_unsigned_char_to_unsigned_int((uint8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_int((uint8_t)INT8_MAX);
+  convert_signed_char_to_signed_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_int((uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_int((uint32_t)INT32_MAX);
+  convert_signed_int_to_unsigned_char((int32_t)(~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN))));
+// CHECK-V7: {{.*}}integer-truncation.c:1100:10: runtime error: implicit conversion from type 'int' of value 2147483519 (32-bit, signed) to type 'unsigned char' changed the value to 127 (8-bit, unsigned)
+  convert_signed_char_to_unsigned_char((int8_t)INT8_MAX);
+  convert_unsigned_char_to_signed_char((uint8_t)INT8_MAX);
+  convert_signed_char_to_unsigned_int((int8_t)INT8_MAX);
+  convert_unsigned_int_to_signed_char(~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V7: {{.*}}integer-truncation.c:1500:10: runtime error: implicit conversion from type 'unsigned int' of value 2147483519 (32-bit, unsigned) to type 'signed char' changed the value to 127 (8-bit, signed)
+  convert_signed_int_to_signed_char((int32_t)~(((uint32_t)INT32_MIN) | ((uint32_t)(uint8_t)INT8_MIN)));
+// CHECK-V7: {{.*}}integer-truncation.c:1600:10: runtime error: implicit conversion from type 'int' of value 2147483519 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed)
+#else
+#error Some V* needs to be defined!
 #endif
 
-  // Positive tests.
-
-  uint8_t t_b0 = (~((uint16_t)(0)));
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:18: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'uint8_t' (aka 'unsigned char') changed the value to 255 (8-bit, unsigned)
-
-  uint8_t t_b1 = (~((uint32_t)0));
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:18: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4294967295 (32-bit, unsigned) to type 'uint8_t' (aka 'unsigned char') changed the value to 255 (8-bit, unsigned)
-  uint16_t t_b2 = (~((uint32_t)0));
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:19: runtime error: implicit conversion from type 'uint32_t' (aka 'unsigned int') of value 4294967295 (32-bit, unsigned) to type 'uint16_t' (aka 'unsigned short') changed the value to 65535 (16-bit, unsigned)
-
-  uint8_t t_b3 = ~((uint64_t)0);
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:18: runtime error: implicit conversion from type 'uint64_t' (aka 'unsigned long{{[^']*}}') of value 18446744073709551615 (64-bit, unsigned) to type 'uint8_t' (aka 'unsigned char') changed the value to 255 (8-bit, unsigned)
-  uint16_t t_b4 = ~((uint64_t)0);
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:19: runtime error: implicit conversion from type 'uint64_t' (aka 'unsigned long{{[^']*}}') of value 18446744073709551615 (64-bit, unsigned) to type 'uint16_t' (aka 'unsigned short') changed the value to 65535 (16-bit, unsigned)
-  uint32_t t_b5 = ~((uint64_t)0);
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:19: runtime error: implicit conversion from type 'uint64_t' (aka 'unsigned long{{[^']*}}') of value 18446744073709551615 (64-bit, unsigned) to type 'uint32_t' (aka 'unsigned int') changed the value to 4294967295 (32-bit, unsigned)
-
-  int8_t t1 = 255;
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:15: runtime error: implicit conversion from type 'int' of value 255 (32-bit, signed) to type 'int8_t' (aka 'signed char') changed the value to -1 (8-bit, signed)
-  uint8_t t2 = 256;
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:16: runtime error: implicit conversion from type 'int' of value 256 (32-bit, signed) to type 'uint8_t' (aka 'unsigned char') changed the value to 0 (8-bit, unsigned)
-  int8_t t3 = 256;
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:15: runtime error: implicit conversion from type 'int' of value 256 (32-bit, signed) to type 'int8_t' (aka 'signed char') changed the value to 0 (8-bit, signed)
-  uint8_t t4 = 257;
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:16: runtime error: implicit conversion from type 'int' of value 257 (32-bit, signed) to type 'uint8_t' (aka 'unsigned char') changed the value to 1 (8-bit, unsigned)
-  int8_t t5 = 257;
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:15: runtime error: implicit conversion from type 'int' of value 257 (32-bit, signed) to type 'int8_t' (aka 'signed char') changed the value to 1 (8-bit, signed)
-  int8_t t6 = 128;
-// CHECK: {{.*}}integer-truncation.c:[[@LINE-1]]:15: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'int8_t' (aka 'signed char') changed the value to -128 (8-bit, signed)
-
   return 0;
 }
diff --git a/test/ubsan/TestCases/Misc/log-path_test.cc b/test/ubsan/TestCases/Misc/log-path_test.cc
index bb89f14..23ad7e2 100644
--- a/test/ubsan/TestCases/Misc/log-path_test.cc
+++ b/test/ubsan/TestCases/Misc/log-path_test.cc
@@ -12,11 +12,13 @@
 
 // Good log_path.
 // RUN: rm -f %t.log.*
+// RUN: %device_rm -f '%t.log.*'
 // RUN: %env_ubsan_opts=log_path='"%t.log"' %run %t -4 2> %t.out
 // RUN: FileCheck %s --check-prefix=CHECK-ERROR < %t.log.*
 
 // Run w/o errors should not produce any log.
 // RUN: rm -f %t.log.*
+// RUN: %device_rm -f '%t.log.*'
 // RUN: %env_ubsan_opts=log_path='"%t.log"'  %run %t 4
 // RUN: not cat %t.log.*
 
diff --git a/test/ubsan/TestCases/Pointer/index-overflow.cpp b/test/ubsan/TestCases/Pointer/index-overflow.cpp
index eb7f95e..f9b1fea 100644
--- a/test/ubsan/TestCases/Pointer/index-overflow.cpp
+++ b/test/ubsan/TestCases/Pointer/index-overflow.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsanitize=pointer-overflow %s -o %t
-// RUN: %t 1 2>&1 | FileCheck %s --check-prefix=ERR
-// RUN: %t 0 2>&1 | FileCheck %s --check-prefix=SAFE
-// RUN: %t -1 2>&1 | FileCheck %s --check-prefix=SAFE
+// RUN: %run %t 1 2>&1 | FileCheck %s --check-prefix=ERR
+// RUN: %run %t 0 2>&1 | FileCheck %s --check-prefix=SAFE
+// RUN: %run %t -1 2>&1 | FileCheck %s --check-prefix=SAFE
 
 #include <stdio.h>
 #include <stdint.h>
diff --git a/test/ubsan/TestCases/Pointer/unsigned-index-expression.cpp b/test/ubsan/TestCases/Pointer/unsigned-index-expression.cpp
index 0002c71..5a14326 100644
--- a/test/ubsan/TestCases/Pointer/unsigned-index-expression.cpp
+++ b/test/ubsan/TestCases/Pointer/unsigned-index-expression.cpp
@@ -1,5 +1,5 @@
 // RUN: %clangxx -std=c++11 -fsanitize=pointer-overflow %s -o %t
-// RUN: %t 2>&1 | FileCheck %s
+// RUN: %run %t 2>&1 | FileCheck %s
 
 int main(int argc, char *argv[]) {
   char c;
diff --git a/test/ubsan/lit.common.cfg b/test/ubsan/lit.common.cfg
index e20832b..9a7a890 100644
--- a/test/ubsan/lit.common.cfg
+++ b/test/ubsan/lit.common.cfg
@@ -74,3 +74,8 @@
 config.available_features.add('arch=' + config.target_arch)
 
 config.excludes = ['Inputs']
+
+# Limit parallelism if necessary
+if config.host_os == 'Darwin':
+  if config.apple_platform != "osx" and not config.apple_platform.endswith("sim"):
+    config.parallelism_group = "darwin-ios-device-sanitizer"
diff --git a/test/ubsan/lit.site.cfg.in b/test/ubsan/lit.site.cfg.in
index a4d7b50..60c9c84 100644
--- a/test/ubsan/lit.site.cfg.in
+++ b/test/ubsan/lit.site.cfg.in
@@ -7,6 +7,7 @@
 config.target_arch = "@UBSAN_TEST_TARGET_ARCH@"
 config.use_lld = @UBSAN_TEST_USE_LLD@
 config.use_thinlto = @UBSAN_TEST_USE_THINLTO@
+config.apple_platform = "@UBSAN_TEST_APPLE_PLATFORM@"
 
 # Load common config for all compiler-rt lit tests.
 lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
diff --git a/test/ubsan_minimal/TestCases/implicit-integer-truncation.c b/test/ubsan_minimal/TestCases/implicit-integer-truncation.c
index 1db6e69..28711d2 100644
--- a/test/ubsan_minimal/TestCases/implicit-integer-truncation.c
+++ b/test/ubsan_minimal/TestCases/implicit-integer-truncation.c
@@ -3,7 +3,7 @@
 #include <stdint.h>
 
 int main() {
-// CHECK-NOT: integer-truncation.c
+// CHECK-NOT: implicit-conversion
 
   // Negative tests. Even if they produce unexpected results, this sanitizer does not care.
   int8_t n0 = (~((uint32_t)(0))); // ~0 -> -1, but do not warn.
@@ -19,6 +19,7 @@
   // Positive tests.
   uint8_t t0 = (~((uint32_t)(0)));
 // CHECK: implicit-conversion
+// CHECK-NOT: implicit-conversion
 
   return 0;
 }
diff --git a/test/xray/TestCases/Posix/argv0-log-file-name.cc b/test/xray/TestCases/Posix/argv0-log-file-name.cc
index 2f9a234..a2cb11b 100644
--- a/test/xray/TestCases/Posix/argv0-log-file-name.cc
+++ b/test/xray/TestCases/Posix/argv0-log-file-name.cc
@@ -4,7 +4,7 @@
 // RUN: %clangxx_xray -std=c++11 %s -o %t
 // RUN: XRAY_OPTIONS="patch_premain=true xray_naive_log=true" %run %t > xray.log.file.name 2>&1
 // RUN: ls | FileCheck xray.log.file.name
-// RUN: rm xray-log.* xray.log.file.name
+// RUN: rm xray-log.argv0-log-file-name.* xray.log.file.name
 
 // UNSUPPORTED: target-is-mips64,target-is-mips64el
 
diff --git a/test/xray/TestCases/Posix/clang-no-xray-instrument.cc b/test/xray/TestCases/Posix/clang-no-xray-instrument.cc
new file mode 100644
index 0000000..c2444b1
--- /dev/null
+++ b/test/xray/TestCases/Posix/clang-no-xray-instrument.cc
@@ -0,0 +1,11 @@
+// Test that we cannot actually find XRay instrumentation when we build with
+// -fno-xray-instrument but have code that's marked as 'xray_always_instrument'.
+//
+// RUN: %clangxx -fno-xray-instrument -c %s -o %t.o
+// RUN: not %llvm_xray extract -symbolize %t.o 2>&1 | FileCheck %s
+// REQUIRES: x86_64-target-arch
+// REQUIRES: built-in-llvm-tree
+
+// CHECK: llvm-xray: Cannot extract instrumentation map
+// CHECK-NOT: {{.*always_instrumented.*}}
+[[clang::xray_always_instrument]] int always_instrumented() { return 42; }
diff --git a/test/xray/TestCases/Posix/fdr-mode.cc b/test/xray/TestCases/Posix/fdr-mode.cc
index b12d97c..17d115a 100644
--- a/test/xray/TestCases/Posix/fdr-mode.cc
+++ b/test/xray/TestCases/Posix/fdr-mode.cc
@@ -42,7 +42,6 @@
 void __attribute__((noinline)) fArg(int) { }
 
 int main(int argc, char *argv[]) {
-  using namespace __xray;
   std::cout << "Logging before init." << std::endl;
   // CHECK: Logging before init.
   assert(__xray_log_select_mode("xray-fdr") ==
diff --git a/test/xray/TestCases/Posix/fdr-single-thread.cc b/test/xray/TestCases/Posix/fdr-single-thread.cc
index 480502b..fb65ad8 100644
--- a/test/xray/TestCases/Posix/fdr-single-thread.cc
+++ b/test/xray/TestCases/Posix/fdr-single-thread.cc
@@ -13,16 +13,10 @@
 #include "xray/xray_log_interface.h"
 #include <cassert>
 
-constexpr auto kBufferSize = 16384;
-constexpr auto kBufferMax = 10;
-
 [[clang::xray_always_instrument]] void __attribute__((noinline)) fn() { }
 
 int main(int argc, char *argv[]) {
-  using namespace __xray;
-  FDRLoggingOptions Opts;
-
-  auto status = __xray_log_init(kBufferSize, kBufferMax, &Opts, sizeof(Opts));
+  auto status = __xray_log_init_mode("xray-fdr", "");
   assert(status == XRayLogInitStatus::XRAY_LOG_INITIALIZED);
 
   __xray_patch();
diff --git a/test/xray/TestCases/Posix/fdr-thread-order.cc b/test/xray/TestCases/Posix/fdr-thread-order.cc
index 1d6b017..a5d7e6f 100644
--- a/test/xray/TestCases/Posix/fdr-thread-order.cc
+++ b/test/xray/TestCases/Posix/fdr-thread-order.cc
@@ -1,15 +1,12 @@
-// RUN: %clangxx_xray -g -std=c++11 %s -o %t
-// RUN: rm fdr-thread-order.* || true
-// RUN: XRAY_OPTIONS="patch_premain=false xray_naive_log=false \
-// RUN:    xray_logfile_base=fdr-thread-order. xray_fdr_log=true verbosity=1 \
-// RUN:    xray_fdr_log_func_duration_threshold_us=0" %run %t 2>&1 | \
+// RUN: rm -rf %t && mkdir %t
+// RUN: %clangxx_xray -g -std=c++11 %s -o %t.exe
+// RUN: XRAY_OPTIONS="patch_premain=false \
+// RUN:    xray_logfile_base=%t/ xray_mode=xray-fdr verbosity=1" \
+// RUN:    XRAY_FDR_OPTIONS=func_duration_threshold_us=0 %run %t.exe 2>&1 | \
 // RUN:    FileCheck %s
-// RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \
-// RUN:    "`ls fdr-thread-order.* | head -1`"
-// RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t \
-// RUN:    "`ls fdr-thread-order.* | head -1`" | \
-// RUN:    FileCheck %s --check-prefix TRACE
-// RUN: rm fdr-thread-order.*
+// RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t.exe %t/*
+// RUN: %llvm_xray convert --symbolize --output-format=yaml -instr_map=%t.exe %t/* | \
+// RUN:   FileCheck %s --check-prefix TRACE
 // FIXME: Make llvm-xray work on non-x86_64 as well.
 // REQUIRES: x86_64-target-arch
 // REQUIRES: built-in-llvm-tree
@@ -19,9 +16,6 @@
 #include <cassert>
 #include <thread>
 
-constexpr auto kBufferSize = 16384;
-constexpr auto kBufferMax = 10;
-
 std::atomic<uint64_t> var{0};
 
 [[clang::xray_always_instrument]] void __attribute__((noinline)) f1() {
@@ -35,11 +29,8 @@
 }
 
 int main(int argc, char *argv[]) {
-  using namespace __xray;
-  FDRLoggingOptions Options;
   __xray_patch();
-  assert(__xray_log_init(kBufferSize, kBufferMax, &Options,
-                         sizeof(FDRLoggingOptions)) ==
+  assert(__xray_log_init_mode("xray-fdr", "") ==
          XRayLogInitStatus::XRAY_LOG_INITIALIZED);
 
   std::atomic_thread_fence(std::memory_order_acq_rel);
diff --git a/test/xray/TestCases/Posix/profiling-multi-threaded.cc b/test/xray/TestCases/Posix/profiling-multi-threaded.cc
index 45e5e70..8bd15b7 100644
--- a/test/xray/TestCases/Posix/profiling-multi-threaded.cc
+++ b/test/xray/TestCases/Posix/profiling-multi-threaded.cc
@@ -41,7 +41,7 @@
   std::string current_mode = __xray_log_get_current_mode();
   assert(current_mode == "xray-profiling");
   assert(__xray_patch() == XRayPatchingStatus::SUCCESS);
-  assert(__xray_log_init(0, 0, nullptr, 0) ==
+  assert(__xray_log_init_mode("xray-profiling", "") ==
          XRayLogInitStatus::XRAY_LOG_INITIALIZED);
   std::thread t0([] { f0(); });
   std::thread t1([] { f0(); });
diff --git a/test/xray/Unit/lit.site.cfg.in b/test/xray/Unit/lit.site.cfg.in
index be860de..54fcc1c 100644
--- a/test/xray/Unit/lit.site.cfg.in
+++ b/test/xray/Unit/lit.site.cfg.in
@@ -14,3 +14,11 @@
 # Do not patch the XRay unit tests pre-main, and also make the error logging
 # verbose to get a more accurate error logging mechanism.
 config.environment['XRAY_OPTIONS'] = 'patch_premain=false'
+
+# Add the LLVM Library directory to the LD_LIBRARY_PATH to allow tests to look
+# up the shared libraries.
+if 'LD_LIBRARY_PATH' in os.environ:
+  libdirs = os.path.pathsep.join((config.llvm_lib_dir, os.environ['LD_LIBRARY_PATH']))
+  config.environment['LD_LIBRARY_PATH'] = libdirs
+else:
+  config.environment['LD_LIBRARY_PATH'] = config.llvm_lib_dir
diff --git a/unittests/lit.common.unit.configured.in b/unittests/lit.common.unit.configured.in
index fafac19..9f6fada 100644
--- a/unittests/lit.common.unit.configured.in
+++ b/unittests/lit.common.unit.configured.in
@@ -10,6 +10,7 @@
 config.llvm_build_mode = "@LLVM_BUILD_MODE@"
 config.host_arch = "@HOST_ARCH@"
 config.host_os = "@HOST_OS@"
+config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@"
 
 # LLVM tools dir and build mode can be passed in lit parameters,
 # so try to apply substitution.