[pytorch-vulkan] floor-divide for tensor, tensor (#112190)

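Summary: Title says it all: adds Vulkan support for the tensor-tensor overloads of floor_divide (out-of-place and in-place), together with the matching `floor(X / Y)` shader variants and tests.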

Test Plan:
## Compile on Mac and run on Android

```
buck2 build -c ndk.static_linking=true -c pt.enable_qpl=0  --target-platforms=ovr_config//platform/android:arm32-fbsource //xplat/caffe2:pt_vulkan_api_test_binAndroid  --show-output && adb push buck-out/v2/gen/fbsource/f1f3f9bed27e143c/xplat/caffe2/__pt_vulkan_api_test_binAndroid__/pt_vulkan_api_test_binAndroid /data/local/tmp
```

Run on Android:
```
$ adb shell /data/local/tmp/pt_vulkan_api_test_binAndroid
...
[ RUN      ] VulkanAPITest.lstm_prepack_success
[       OK ] VulkanAPITest.lstm_prepack_success (11 ms)
[ RUN      ] VulkanAPITest.querypool_flushed_shader_log
xplat/caffe2/aten/src/ATen/test/vulkan_api_test.cpp:7667: Skipped
QueryPool is not available
[  SKIPPED ] VulkanAPITest.querypool_flushed_shader_log (0 ms)
[----------] 396 tests from VulkanAPITest (29980 ms total)
[----------] Global test environment tear-down
[==========] 396 tests from 1 test suite ran. (29980 ms total)
[  PASSED  ] 395 tests.
[  SKIPPED ] 1 test, listed below:
[  SKIPPED ] VulkanAPITest.querypool_flushed_shader_log
  YOU HAVE 7 DISABLED TESTS

```

All Passed.
Full Output: P865232089

Reviewed By: copyrightly

Differential Revision: D50677361

Pull Request resolved: https://github.com/pytorch/pytorch/pull/112190
Approved by: https://github.com/manuelcandales
diff --git a/aten/src/ATen/native/vulkan/glsl/templates/binary_op_params.yaml b/aten/src/ATen/native/vulkan/glsl/templates/binary_op_params.yaml
index c41a760..87bb76d 100644
--- a/aten/src/ATen/native/vulkan/glsl/templates/binary_op_params.yaml
+++ b/aten/src/ATen/native/vulkan/glsl/templates/binary_op_params.yaml
@@ -40,6 +40,9 @@
       - NAME: pow
         IS_DIV: 0
         OPERATOR: pow(X, Y)
+      - NAME: floor_divide
+        IS_DIV: 1
+        OPERATOR: floor(X / Y)
 
 binary_op_tensor_inplace:
   parameter_names_with_default_values:
@@ -59,3 +62,6 @@
       - NAME: pow_
         IS_DIV: 0
         OPERATOR: pow(X, Y)
+      - NAME: floor_divide_
+        IS_DIV: 1
+        OPERATOR: floor(X / Y)
diff --git a/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp b/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp
index 754fa49..4bd6611 100644
--- a/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp
+++ b/aten/src/ATen/native/vulkan/ops/BinaryOp.cpp
@@ -539,6 +539,16 @@
       VK_KERNEL(floor_mul_scalar_));
 }
 
+Tensor floor_divide_tensor(const Tensor& self, const Tensor& other) { // out-of-place tensor-by-tensor floor divide
+  return binary_op_tensor(
+      self, other, c10::optional<Scalar>(), VK_KERNEL(floor_divide)); // empty optional: no Scalar is supplied; floor_divide variant is floor(X / Y) in binary_op_params.yaml
+}
+
+Tensor& floor_divide_tensor_(Tensor& self, const Tensor& other_arg) { // in-place tensor-by-tensor floor divide; returns what binary_op_tensor_ returns
+  return binary_op_tensor_(
+      self, other_arg, c10::optional<Scalar>(), VK_KERNEL(floor_divide_)); // empty optional: no Scalar is supplied; floor_divide_ variant is floor(X / Y) in binary_op_params.yaml
+}
+
 #ifdef USE_VULKAN_API
 
 TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
@@ -572,6 +582,12 @@
   m.impl(
       TORCH_SELECTIVE_NAME("aten::floor_divide_.Scalar"),
       TORCH_FN(floor_divide_scalar_));
+  m.impl(
+      TORCH_SELECTIVE_NAME("aten::floor_divide"),
+      TORCH_FN(floor_divide_tensor));
+  m.impl(
+      TORCH_SELECTIVE_NAME("aten::floor_divide_.Tensor"),
+      TORCH_FN(floor_divide_tensor_));
 }
 
 #endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index f61aa06..02a6a60 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -3720,6 +3720,94 @@
   test_floor_divide_scalar_inplace({3, 3, 12, 12}, 0.3, 0.08);
 }
 
+TEST_F(VulkanAPITest, floor_divide_zero_dim_tensor) { // CPU vs. Vulkan floor_divide with a zero-dim divisor
+  c10::InferenceMode mode;
+
+  std::vector<int64_t> input_shape{5, 3, 4, 5};
+  float input_scale = 100.0;
+
+  auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+  in_cpu = at::mul(in_cpu, input_scale);
+  auto in_vk = in_cpu.vulkan();
+
+  auto other_cpu = at::zeros({}, at::device(at::kCPU).dtype(at::kFloat)) + 10.0f;
+  auto other_vk = other_cpu.vulkan();
+
+  auto out_cpu = at::floor_divide(in_cpu, other_cpu);
+  auto out_vk = at::floor_divide(in_vk, other_vk);
+
+  // Tolerance is 1.0 because floor() can turn a tiny difference in the quotient
+  // into a difference of 1. Could also bound the violation count (should be rare).
+  const auto check = checkRtol(out_cpu - out_vk.cpu(), 1.0f);
+  if (!check) {
+    std::cout << "floor_divide test failed with "
+              << "scale: " << input_scale
+              << std::endl;
+  }
+
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, floor_divide_tensor) { // CPU vs. Vulkan floor_divide with same-shape tensors
+  c10::InferenceMode mode;
+
+  std::vector<int64_t> input_shape{6, 3, 5, 5};
+  float input_scale = 10.0;
+
+  auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+  in_cpu = at::mul(in_cpu, input_scale);
+  // "other" is at least 0.5 to avoid rounding errors caused by very small
+  // values.
+  auto other_cpu =
+      at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.5;
+
+  auto in_vk = in_cpu.vulkan();
+  auto other_vk = other_cpu.vulkan();
+
+  auto out_cpu = at::floor_divide(in_cpu, other_cpu);
+  auto out_vk = at::floor_divide(in_vk, other_vk);
+
+  // Tolerance is 1.0 because floor() can turn a tiny difference in the quotient
+  // into a difference of 1. Could also bound the violation count (should be rare).
+  const auto check = checkRtol(out_cpu - out_vk.cpu(), 1.0f);
+  if (!check) {
+    std::cout << "floor_divide test failed with "
+              << "scale: " << input_scale << std::endl;
+  }
+
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, floor_divide_tensor_inplace) { // CPU vs. Vulkan in-place floor_divide_ with same-shape tensors
+  c10::InferenceMode mode;
+
+  std::vector<int64_t> input_shape{5, 3, 5, 5};
+  float input_scale = 10.0;
+
+  auto in_cpu = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+  in_cpu = at::mul(in_cpu, input_scale);
+  // "other" is at least 0.5 to avoid rounding errors caused by very small
+  // values.
+  auto other_cpu =
+      at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat)) + 0.5;
+
+  auto in_vk = in_cpu.vulkan();
+  auto other_vk = other_cpu.vulkan();
+
+  in_cpu.floor_divide_(other_cpu);
+  in_vk.floor_divide_(other_vk);
+
+  // Tolerance is 1.0 because floor() can turn a tiny difference in the quotient
+  // into a difference of 1. Could also bound the violation count (should be rare).
+  const auto check = checkRtol(in_cpu - in_vk.cpu(), 1.0f);
+  if (!check) {
+    std::cout << "floor_divide test failed with "
+              << "scale: " << input_scale << std::endl;
+  }
+
+  ASSERT_TRUE(check);
+}
+
 TEST_F(VulkanAPITest, relu) {
   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
   const auto in_vulkan = in_cpu.vulkan();