Enable win-arm64

This patch enables building PyTorch from source on Windows on Arm with
the Ninja and 'Visual Studio 16 2019' CMake generators.

Tests:
- Build from source: 'python setup.py develop'.
- Run a simple PyTorch example: passed
- python test\test_torch.py:
-- same results as on x64
-- Ran 1344 tests, failures=2
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72424
diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h
index bed04a4..4143ae5 100644
--- a/c10/util/Bitset.h
+++ b/c10/util/Bitset.h
@@ -76,7 +76,7 @@
   // (i.e. if the very first bit is set, this function returns '1'), and a
   // return of '0' means that there was no bit set.
   size_t find_first_set() const {
-#if defined(_MSC_VER) && defined(_M_X64)
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
     unsigned long result;
     bool has_bits_set = (0 != _BitScanForward64(&result, bitset_));
     if (!has_bits_set) {
diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h
index 108d998..3299327 100644
--- a/caffe2/serialize/crc_alt.h
+++ b/caffe2/serialize/crc_alt.h
@@ -101,8 +101,11 @@
   // Windows always little endian
   #define __BYTE_ORDER __LITTLE_ENDIAN
 
+  #if !defined(_M_ARM64)
   // intrinsics / prefetching
   #include <xmmintrin.h>
+  #endif
+
   #ifdef __MINGW32__
     #define PREFETCH(location) __builtin_prefetch(location)
   #else
diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py
index d795770..60a2e3c 100644
--- a/tools/build_pytorch_libs.py
+++ b/tools/build_pytorch_libs.py
@@ -1,4 +1,5 @@
 import os
+import platform
 from glob import glob
 import shutil
 from typing import Dict, Optional
@@ -10,6 +11,22 @@
 
 def _overlay_windows_vcvars(env: Dict[str, str]) -> Dict[str, str]:
     vc_arch = 'x64' if IS_64BIT else 'x86'
+
+    if platform.machine() == 'ARM64':
+        vc_arch = 'x64_arm64'
+
+        # First Win11 Windows on Arm build version that supports x64 emulation
+        # is 10.0.22000.
+        win11_1st_version = (10, 0, 22000)
+        current_win_version = tuple(int(version_part) for version_part in
+                                    platform.version().split('.'))
+        if current_win_version < win11_1st_version:
+            vc_arch = 'x86_arm64'
+            print("Warning: 32-bit toolchain will be used, but 64-bit linker "
+                  "is recommended to avoid out-of-memory linker error!")
+            print("Warning: Please consider upgrading to Win11, where x64 "
+                  "emulation is enabled!")
+
     vc_env: Dict[str, str] = distutils._msvccompiler._get_vc_env(vc_arch)
     # Keys in `_get_vc_env` are always lowercase.
     # We turn them into uppercase before overlaying vcvars
diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py
index ff17577..a22c58e 100644
--- a/tools/setup_helpers/cmake.py
+++ b/tools/setup_helpers/cmake.py
@@ -4,6 +4,7 @@
 
 import multiprocessing
 import os
+import platform
 import re
 from subprocess import check_call, check_output, CalledProcessError
 import sys
@@ -220,8 +221,11 @@
                           'in the build steps carefully.')
                     sys.exit(1)
             if IS_64BIT:
-                args.append('-Ax64')
-                toolset_dict['host'] = 'x64'
+                if platform.machine() == 'ARM64':
+                    args.append('-A ARM64')
+                else:
+                    args.append('-Ax64')
+                    toolset_dict['host'] = 'x64'
             if toolset_dict:
                 toolset_expr = ','.join(["{}={}".format(k, v) for k, v in toolset_dict.items()])
                 args.append('-T' + toolset_expr)
diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
index 68fee99..013a8e8 100644
--- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
@@ -230,7 +230,7 @@
 // understand for AVX512. When we need better CPU performance this
 // optimization can be re-enabled by tracking down the platforms where
 // this error occurs and only selectively disabling it.
-#ifdef _MSC_VER
+#if (defined(_MSC_VER) && !defined(_M_ARM64))
 // According to https://stackoverflow.com/a/29178079, we are able to
 // detect which arch level is supported by the vectorizer using
 // the macro __isa_available. It is added during runtime.