Revert "Change ATEN generator argument type to const std::optional<Generator>& (#120076)"

This reverts commit 4305c64fea154ee1ab566e19bd7568753fc30916.

Reverted https://github.com/pytorch/pytorch/pull/120076 on behalf of https://github.com/izaitsevfb due to breaking internal builds (take 3) ([comment](https://github.com/pytorch/pytorch/pull/120076#issuecomment-1986338164))
diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 20e7a61..75b1f91 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-8f913829abd9de749339d8d74b7357e1be3a7907
+fba464b199559f61faa720de8bf64cf955cfdce7
diff --git a/aten/src/ATen/VmapModeRegistrations.cpp b/aten/src/ATen/VmapModeRegistrations.cpp
index 171a8dc..ab4556c 100644
--- a/aten/src/ATen/VmapModeRegistrations.cpp
+++ b/aten/src/ATen/VmapModeRegistrations.cpp
@@ -42,34 +42,34 @@
 #define TENSOROPTIONS c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>
 
   // random operations (out-of-place)
-  m.impl("bernoulli", unsupportedRandomOp<const Tensor&, const optional<Generator>&>);
-  m.impl("bernoulli.out", unsupportedRandomOp_<const Tensor&, const optional<Generator>&, Tensor&>);
-  m.impl("bernoulli.p", unsupportedRandomOp<const Tensor&, double, const optional<Generator>&>);
-  m.impl("bernoulli_.Tensor", unsupportedRandomOp_<Tensor&, const Tensor&, const optional<Generator>&>);
-  m.impl("bernoulli_.float", unsupportedRandomOp_<Tensor&, double, const optional<Generator>&>);
+  m.impl("bernoulli", unsupportedRandomOp<const Tensor&, optional<Generator>>);
+  m.impl("bernoulli.out", unsupportedRandomOp_<const Tensor&, optional<Generator>, Tensor&>);
+  m.impl("bernoulli.p", unsupportedRandomOp<const Tensor&, double, optional<Generator>>);
+  m.impl("bernoulli_.Tensor", unsupportedRandomOp_<Tensor&, const Tensor&, optional<Generator>>);
+  m.impl("bernoulli_.float", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
 
-  m.impl("cauchy_", unsupportedRandomOp_<Tensor&, double, double, const optional<Generator>&>);
-  m.impl("exponential_", unsupportedRandomOp_<Tensor&, double, const optional<Generator>&>);
-  m.impl("geometric_", unsupportedRandomOp_<Tensor&, double, const optional<Generator>&>);
-  m.impl("log_normal_", unsupportedRandomOp_<Tensor&, double, double, const optional<Generator>&>);
-  m.impl("multinomial", unsupportedRandomOp<const Tensor&, int64_t, bool, const optional<Generator>&>);
-  m.impl("multinomial.out", unsupportedRandomOp_<const Tensor&, int64_t, bool, const optional<Generator>&, Tensor&>);
+  m.impl("cauchy_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
+  m.impl("exponential_", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
+  m.impl("geometric_", unsupportedRandomOp_<Tensor&, double, optional<Generator>>);
+  m.impl("log_normal_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
+  m.impl("multinomial", unsupportedRandomOp<const Tensor&, int64_t, bool, optional<Generator>>);
+  m.impl("multinomial.out", unsupportedRandomOp_<const Tensor&, int64_t, bool, optional<Generator>, Tensor&>);
 
-  m.impl("normal.Tensor_float", unsupportedRandomOp<const Tensor&, double, const optional<Generator>&>);
-  m.impl("normal.Tensor_float_out", unsupportedRandomOp_<const Tensor&, double, const optional<Generator>&, Tensor&>);
-  m.impl("normal.float_Tensor_out", unsupportedRandomOp_<double, const Tensor&, const optional<Generator>&, Tensor&>);
-  m.impl("normal.float_Tensor", unsupportedRandomOp<double, const Tensor&, const optional<Generator>&>);
-  m.impl("normal.Tensor_Tensor", unsupportedRandomOp<const Tensor&, const Tensor&, const optional<Generator>&>);
-  m.impl("normal.Tensor_Tensor_out", unsupportedRandomOp_<const Tensor&, const Tensor&, const optional<Generator>&, Tensor&>);
-  m.impl("normal.float_float", unsupportedRandomOp<double, double, IntArrayRef, const optional<Generator>&, TENSOROPTIONS>);
-  m.impl("normal.float_float_out", unsupportedRandomOp_<double, double, IntArrayRef, const optional<Generator>&, Tensor&>);
-  m.impl("normal_", unsupportedRandomOp_<Tensor&, double, double, const optional<Generator>&>);
+  m.impl("normal.Tensor_float", unsupportedRandomOp<const Tensor&, double, optional<Generator>>);
+  m.impl("normal.Tensor_float_out", unsupportedRandomOp_<const Tensor&, double, optional<Generator>, Tensor&>);
+  m.impl("normal.float_Tensor_out", unsupportedRandomOp_<double, const Tensor&, optional<Generator>, Tensor&>);
+  m.impl("normal.float_Tensor", unsupportedRandomOp<double, const Tensor&, optional<Generator>>);
+  m.impl("normal.Tensor_Tensor", unsupportedRandomOp<const Tensor&, const Tensor&, optional<Generator>>);
+  m.impl("normal.Tensor_Tensor_out", unsupportedRandomOp_<const Tensor&, const Tensor&, optional<Generator>, Tensor&>);
+  m.impl("normal.float_float", unsupportedRandomOp<double, double, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
+  m.impl("normal.float_float_out", unsupportedRandomOp_<double, double, IntArrayRef, optional<Generator>, Tensor&>);
+  m.impl("normal_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
 
-  m.impl("poisson", unsupportedRandomOp<const Tensor&, const optional<Generator>&>);
+  m.impl("poisson", unsupportedRandomOp<const Tensor&, optional<Generator>>);
 
-  m.impl("random_.from", unsupportedRandomOp_<Tensor&, int64_t, optional<int64_t>, const optional<Generator>&>);
-  m.impl("random_.to", unsupportedRandomOp_<Tensor&, int64_t, const optional<Generator>&>);
-  m.impl("random_", unsupportedRandomOp_<Tensor&, const optional<Generator>&>);
+  m.impl("random_.from", unsupportedRandomOp_<Tensor&, int64_t, optional<int64_t>, optional<Generator>>);
+  m.impl("random_.to", unsupportedRandomOp_<Tensor&, int64_t, optional<Generator>>);
+  m.impl("random_", unsupportedRandomOp_<Tensor&, optional<Generator>>);
 
   m.impl("rand_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, optional<MemoryFormat>>);
   m.impl("randn_like", unsupportedRandomOp<const Tensor&, TENSOROPTIONS, optional<MemoryFormat>>);
@@ -78,34 +78,34 @@
   m.impl("randint_like.low_dtype", unsupportedRandomOp<const Tensor&, int64_t, int64_t, TENSOROPTIONS, optional<MemoryFormat>>);
 
   m.impl("rand", unsupportedRandomOp<IntArrayRef, TENSOROPTIONS>);
-  m.impl("rand.generator", unsupportedRandomOp<IntArrayRef, const optional<Generator>&, TENSOROPTIONS>);
+  m.impl("rand.generator", unsupportedRandomOp<IntArrayRef, optional<Generator>, TENSOROPTIONS>);
   m.impl("rand.names", unsupportedRandomOp<IntArrayRef, optional<DimnameList>, TENSOROPTIONS>);
-  m.impl("rand.generator_with_names", unsupportedRandomOp<IntArrayRef, const optional<Generator>&, optional<DimnameList>, TENSOROPTIONS>);
+  m.impl("rand.generator_with_names", unsupportedRandomOp<IntArrayRef, optional<Generator>, optional<DimnameList>, TENSOROPTIONS>);
   m.impl("rand.out", unsupportedRandomOp_<IntArrayRef, Tensor&>);
-  m.impl("rand.generator_out", unsupportedRandomOp_<IntArrayRef, const optional<Generator>&, Tensor&>);
+  m.impl("rand.generator_out", unsupportedRandomOp_<IntArrayRef, optional<Generator>, Tensor&>);
 
   m.impl("randn", unsupportedRandomOp<IntArrayRef, TENSOROPTIONS>);
-  m.impl("randn.generator", unsupportedRandomOp<IntArrayRef, const optional<Generator>&, TENSOROPTIONS>);
+  m.impl("randn.generator", unsupportedRandomOp<IntArrayRef, optional<Generator>, TENSOROPTIONS>);
   m.impl("randn.names", unsupportedRandomOp<IntArrayRef, optional<DimnameList>, TENSOROPTIONS>);
-  m.impl("randn.generator_with_names", unsupportedRandomOp<IntArrayRef, const optional<Generator>&, optional<DimnameList>, TENSOROPTIONS>);
+  m.impl("randn.generator_with_names", unsupportedRandomOp<IntArrayRef, optional<Generator>, optional<DimnameList>, TENSOROPTIONS>);
   m.impl("randn.out", unsupportedRandomOp_<IntArrayRef, Tensor&>);
-  m.impl("randn.generator_out", unsupportedRandomOp_<IntArrayRef, const optional<Generator>&, Tensor&>);
+  m.impl("randn.generator_out", unsupportedRandomOp_<IntArrayRef, optional<Generator>, Tensor&>);
 
   m.impl("randperm", unsupportedRandomOp<int64_t, TENSOROPTIONS>);
-  m.impl("randperm.generator", unsupportedRandomOp<int64_t, const optional<Generator>&, TENSOROPTIONS>);
+  m.impl("randperm.generator", unsupportedRandomOp<int64_t, optional<Generator>, TENSOROPTIONS>);
   m.impl("randperm.out", unsupportedRandomOp_<int64_t, Tensor&>);
-  m.impl("randperm.generator_out", unsupportedRandomOp_<int64_t, const optional<Generator>&, Tensor&>);
+  m.impl("randperm.generator_out", unsupportedRandomOp_<int64_t, optional<Generator>, Tensor&>);
 
   m.impl("randint", unsupportedRandomOp<int64_t, IntArrayRef, TENSOROPTIONS>);
-  m.impl("randint.generator", unsupportedRandomOp<int64_t, IntArrayRef, const optional<Generator>&, TENSOROPTIONS>);
+  m.impl("randint.generator", unsupportedRandomOp<int64_t, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
   m.impl("randint.low", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, TENSOROPTIONS>);
-  m.impl("randint.low_generator", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, const optional<Generator>&, TENSOROPTIONS>);
+  m.impl("randint.low_generator", unsupportedRandomOp<int64_t, int64_t, IntArrayRef, optional<Generator>, TENSOROPTIONS>);
   m.impl("randint.out", unsupportedRandomOp_<int64_t, IntArrayRef, Tensor&>);
-  m.impl("randint.generator_out", unsupportedRandomOp_<int64_t, IntArrayRef, const optional<Generator>&, Tensor&>);
+  m.impl("randint.generator_out", unsupportedRandomOp_<int64_t, IntArrayRef, optional<Generator>, Tensor&>);
   m.impl("randint.low_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, Tensor&>);
-  m.impl("randint.low_generator_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, const optional<Generator>&, Tensor&>);
+  m.impl("randint.low_generator_out", unsupportedRandomOp_<int64_t, int64_t, IntArrayRef, optional<Generator>, Tensor&>);
 
-  m.impl("uniform_", unsupportedRandomOp_<Tensor&, double, double, const optional<Generator>&>);
+  m.impl("uniform_", unsupportedRandomOp_<Tensor&, double, double, optional<Generator>>);
 
 #undef TENSOROPTIONS
 }
diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h
index 4447557..8b00024 100644
--- a/aten/src/ATen/core/Generator.h
+++ b/aten/src/ATen/core/Generator.h
@@ -149,7 +149,7 @@
  * the backend generator type (CPU/CUDAGeneratorImpl etc.)
  */
 template <typename T>
-static inline T * check_generator(const std::optional<Generator>& gen) {
+static inline T * check_generator(c10::optional<Generator> gen) {
   TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt");
   TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed");
   TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'");
@@ -163,7 +163,7 @@
  * the backend generator type (CPU/CUDAGeneratorImpl etc.)
  */
 template <typename T>
-static inline T* get_generator_or_default(const std::optional<Generator>& gen, const Generator& default_gen) {
+static inline T* get_generator_or_default(const c10::optional<Generator>& gen, const Generator& default_gen) {
   return gen.has_value() && gen->defined() ? check_generator<T>(gen) : check_generator<T>(default_gen);
 }
 
diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
index 35863a5..33e9105 100644
--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
@@ -89,7 +89,7 @@
         ts = ts | gen.key_set();
       }
     }
-    void operator()(const std::optional<at::Generator>& gen) {
+    void operator()(const c10::optional<at::Generator>& gen) {
       if (gen.has_value() && gen->defined()) {
         ts = ts | gen->key_set();
       }
diff --git a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
index b7d5472..44ca280 100644
--- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
@@ -303,8 +303,8 @@
   return std::make_tuple(at::log_sigmoid_backward(out_grad, out_self, out_buffer), 0);
 }
 
-static Tensor binomial_wrapper(const Tensor& count, const Tensor& prob, const std::optional<Generator>& gen) {
-  return at::binomial(count, prob.contiguous(), gen); // Bug in PyTorch, prob shouldn't need to be contiguous
+static Tensor binomial_wrapper(const Tensor& count, const Tensor& prob, c10::optional<Generator> gen) {
+  return at::binomial(count, prob.contiguous(), std::move(gen)); // Bug in PyTorch, prob shouldn't need to be contiguous
 }
 
 TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp
index ada0542..00d3e1d 100644
--- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp
+++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp
@@ -58,7 +58,7 @@
   }
 }
 
-static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor& p_, const std::optional<Generator>& gen) {
+static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
   auto maybe_layer = maybeCurrentDynamicLayer();
   auto cur_level = maybe_layer->layerId();
@@ -94,11 +94,11 @@
     "If this is necessary for your usage, please file an issue with functorch.");
   if (randomness == RandomnessType::Same && self_bdim) {
     auto intermediate = empty(self.sizes(), self.options());
-    intermediate.bernoulli_(other_, gen);
+    intermediate.bernoulli_(other_, std::move(gen));
     self.copy_(intermediate); // batching should make this just work out...
     return self;
   } else {
-    self_.bernoulli_(other_, gen);
+    self_.bernoulli_(other_, std::move(gen));
     return self;
   }
 }
@@ -213,7 +213,7 @@
   return std::make_tuple(output, mask);
 }
 
-static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, const std::optional<Generator>& generator) {
+static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_samples, const bool replacement, const c10::optional<Generator> generator) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
   auto maybe_layer = maybeCurrentDynamicLayer();
   const auto cur_level = maybe_layer->layerId();
diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp
index dfcea09..7f5c696 100644
--- a/aten/src/ATen/native/Activation.cpp
+++ b/aten/src/ATen/native/Activation.cpp
@@ -573,7 +573,7 @@
     const Tensor& noise,
     const Scalar& lower_,
     const Scalar& upper_,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   using opmath_t = at::opmath_type<scalar_t>;
   opmath_t lower = lower_.to<opmath_t>();
   opmath_t upper = upper_.to<opmath_t>();
@@ -604,7 +604,7 @@
     const Scalar& lower,
     const Scalar& upper,
     bool training,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     Tensor& output) {
   if (training) {
     AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "rrelu_with_noise_out_cpu", [&] {
@@ -626,10 +626,10 @@
     const Scalar& lower,
     const Scalar& upper,
     bool training,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   auto output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   return at::native::rrelu_with_noise_out_cpu(
-      self, noise, lower, upper, training, generator, output);
+      self, noise, lower, upper, training, std::move(generator), output);
 }
 
 Tensor& rrelu_with_noise_cpu_(
@@ -638,9 +638,9 @@
     const Scalar& lower,
     const Scalar& upper,
     bool training,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   return at::native::rrelu_with_noise_out_cpu(
-      self, noise, lower, upper, training, generator, self);
+      self, noise, lower, upper, training, std::move(generator), self);
 }
 
 Tensor rrelu_with_noise_backward(
@@ -661,14 +661,14 @@
   }
 }
 
-Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, const std::optional<Generator>& generator) {
+Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, c10::optional<Generator> generator) {
   TORCH_CHECK(lower.to<double>() <= upper.to<double>(), "Lower bound should be less than or equal to the upper bound")
-  return at::rrelu_with_noise(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, generator);
+  return at::rrelu_with_noise(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, std::move(generator));
 }
 
-Tensor & rrelu_(Tensor & self, const Scalar& lower, const Scalar& upper, bool training, const std::optional<Generator>& generator) {
+Tensor & rrelu_(Tensor & self, const Scalar& lower, const Scalar& upper, bool training, c10::optional<Generator> generator) {
   TORCH_CHECK(lower.to<double>() <= upper.to<double>(), "Lower bound should be less than or equal to the upper bound")
-  return at::rrelu_with_noise_(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, generator);
+  return at::rrelu_with_noise_(self, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT), lower, upper, training, std::move(generator));
 }
 
 TORCH_IMPL_FUNC(threshold_out)(const Tensor& self, const Scalar& threshold, const Scalar& value, const Tensor& result) {
diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h
index 00e7704..a5ed952 100644
--- a/aten/src/ATen/native/DistributionTemplates.h
+++ b/aten/src/ATen/native/DistributionTemplates.h
@@ -81,7 +81,7 @@
   }
 
 template<template<typename> class random_kernel, typename RNG>
-at::Tensor& random_impl(at::Tensor& self, const std::optional<Generator>& generator) {
+at::Tensor& random_impl(at::Tensor& self, c10::optional<Generator> generator) {
   CHECK_EMPTY_AND_RETURN(self);
   auto iter = at::TensorIterator::borrowing_nullary_op(self);
   random_kernel<RNG>()(iter, generator);
@@ -132,7 +132,7 @@
 }
 
 template<template<typename> class random_from_to_kernel, typename RNG>
-at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional<int64_t> to_opt, const std::optional<Generator>& generator) {
+at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional<int64_t> to_opt, c10::optional<Generator> generator) {
   uint64_t range = 0;
   auto iter = at::TensorIterator::borrowing_nullary_op(self);
   if (to_opt.has_value()) {
@@ -200,7 +200,7 @@
   TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std);
 
 template<template<typename> class normal_kernel, typename RNG>
-Tensor& normal_impl_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
   CHECK_NORMAL_STD(std);
   CHECK_EMPTY_AND_RETURN(self);
 
@@ -216,7 +216,7 @@
 }
 
 template<template<typename> class normal_kernel, typename RNG>
-Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, const std::optional<Generator>& gen) {
+Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional<Generator> gen) {
   CHECK_NORMAL_STD(std);
   auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous);
   auto shape = at::infer_size(mean.sizes(), std_tensor.sizes());
@@ -227,7 +227,7 @@
 }
 
 template<template<typename> class normal_kernel, typename RNG>
-Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional<Generator> gen) {
   CHECK_NORMAL_TENSOR_STD(std);
   auto mean_tensor = at::full({}, mean, output.options());
   auto shape = at::infer_size(mean_tensor.sizes(), std.sizes());
@@ -242,7 +242,7 @@
 }
 
 template<template<typename> class normal_kernel, typename RNG>
-Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
   CHECK_NORMAL_TENSOR_STD(std);
   auto shape = at::infer_size(mean.sizes(), std.sizes());
   at::native::resize_output(output, shape);
@@ -256,7 +256,7 @@
 }
 
 template<template<typename> class normal_kernel, typename RNG>
-Tensor normal_impl(const Tensor& mean, double std, const std::optional<Generator>& gen) {
+Tensor normal_impl(const Tensor& mean, double std, c10::optional<Generator> gen) {
   CHECK_NORMAL_STD(std);
   Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous);
   normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
@@ -264,7 +264,7 @@
 }
 
 template<template<typename> class normal_kernel, typename RNG>
-Tensor normal_impl(double mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor normal_impl(double mean, const Tensor& std, c10::optional<Generator> gen) {
   CHECK_NORMAL_TENSOR_STD(std);
   Tensor ret = at::empty_like(std, MemoryFormat::Contiguous);
   normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
@@ -272,7 +272,7 @@
 }
 
 template<template<typename> class normal_kernel, typename RNG>
-Tensor normal_impl(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
   CHECK_NORMAL_TENSOR_STD(std);
   auto shape = at::infer_size(mean.sizes(), std.sizes());
   Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous);
@@ -283,7 +283,7 @@
 // ==================================================== Uniform =======================================================
 
 template<template<typename> class uniform_kernel, typename RNG>
-at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, const std::optional<Generator>& generator) {
+at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optional<Generator> generator) {
   if (self.is_complex()) {
     CHECK_EMPTY_AND_RETURN(self);
     auto float_tensor = at::view_as_real(self);
@@ -313,7 +313,7 @@
 // ================================================== LogNormal =======================================================
 
 template<template<typename> class log_normal_kernel, typename RNG>
-at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::optional<Generator> gen) {
   TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std);
   CHECK_EMPTY_AND_RETURN(self);
   auto iter = TensorIterator::borrowing_nullary_op(self);
@@ -324,7 +324,7 @@
 // =================================================== Geometric ======================================================
 
 template<template<typename> class geometric_kernel, typename RNG>
-Tensor& geometric_impl_(Tensor& self, double p, const std::optional<Generator>& gen) {
+Tensor& geometric_impl_(Tensor& self, double p, c10::optional<Generator> gen) {
   TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p);
   CHECK_EMPTY_AND_RETURN(self);
   auto iter = TensorIterator::borrowing_nullary_op(self);
@@ -335,7 +335,7 @@
 // ================================================== Exponential =====================================================
 
 template<template<typename> class exponential_kernel, typename RNG>
-Tensor& exponential_impl_(Tensor& self, double lambda, const std::optional<Generator>& gen) {
+Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional<Generator> gen) {
   TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda);
   CHECK_EMPTY_AND_RETURN(self);
   auto iter = TensorIterator::borrowing_nullary_op(self);
@@ -346,7 +346,7 @@
 // ==================================================== Cauchy ========================================================
 
 template<template<typename> class cauchy_kernel, typename RNG>
-Tensor& cauchy_impl_(Tensor& self, double median, double sigma, const std::optional<Generator>& gen) {
+Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional<Generator> gen) {
   // TODO: instead of variable name 'sigma', use 'gamma' or 'scale'
   // the variance, squared sigma, is undefined for cauchy distribution
   TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma);
@@ -360,7 +360,7 @@
 // ==================================================== Bernoulli =====================================================
 
 template<template<typename> class bernoulli_tensor_kernel, typename RNG>
-Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, const std::optional<Generator>& gen) {
+Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
   CHECK_EMPTY_AND_RETURN(self);
   NoNamesGuard guard;
   at::assert_no_internal_overlap(self);
@@ -369,7 +369,7 @@
 }
 
 template<template<typename> class bernoulli_scalar_kernel, typename RNG>
-Tensor& bernoulli_impl_(Tensor& self, double p, const std::optional<Generator>& gen) {
+Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional<Generator> gen) {
   TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p);
   CHECK_EMPTY_AND_RETURN(self);
   at::assert_no_internal_overlap(self);
@@ -378,7 +378,7 @@
 }
 
 template<template<typename> class bernoulli_tensor_kernel, typename RNG>
-Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, const std::optional<Generator>& gen) {
+Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, c10::optional<Generator> gen) {
   // result.resize_as_(self) requires self to have same dtype as result, so we
   // use resize_ instead.
   // TODO: Fix resize_as_. See pytorch/pytorch#11665.
diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp
index 4088719..4d4eb2e 100644
--- a/aten/src/ATen/native/Distributions.cpp
+++ b/aten/src/ATen/native/Distributions.cpp
@@ -160,96 +160,96 @@
 
 template<typename RNG>
 struct BernoulliStub {
-  void operator()(Tensor& self, const Tensor& p_, const std::optional<Generator>& gen) {
+  void operator()(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
     bernoulli_tensor_stub(self.device().type(), self, p_, gen);
   }
 
-  void operator()(Tensor& self, double p, const std::optional<Generator>& gen) {
+  void operator()(Tensor& self, double p, c10::optional<Generator> gen) {
     bernoulli_scalar_stub(self.device().type(), self, p, gen);
   }
 };
 
-Tensor bernoulli(const Tensor& self, const std::optional<Generator>& gen) {
+Tensor bernoulli(const Tensor& self, c10::optional<Generator> gen) {
   Tensor result = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  result.bernoulli_(self, gen);
+  result.bernoulli_(self, std::move(gen));
   return result;
 }
 
-Tensor bernoulli(const Tensor& self, double p, const std::optional<Generator>& gen) {
+Tensor bernoulli(const Tensor& self, double p, c10::optional<Generator> gen) {
   Tensor result = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  result.bernoulli_(p, gen);
+  result.bernoulli_(p, std::move(gen));
   return result;
 }
 
-Tensor& bernoulli_out(const Tensor& self, const std::optional<Generator>& gen, Tensor& result) {
-  return at::native::templates::bernoulli_out_impl<BernoulliStub, Generator>(result, self, gen);
+Tensor& bernoulli_out(const Tensor& self, c10::optional<Generator> gen, Tensor& result) {
+  return at::native::templates::bernoulli_out_impl<BernoulliStub, Generator>(result, self, std::move(gen));
 }
 
-Tensor& bernoulli_(Tensor& self, const Tensor& p_, const std::optional<Generator>& gen) {
-  return at::native::templates::bernoulli_impl_<BernoulliStub, Generator>(self, p_, gen);
+Tensor& bernoulli_(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
+  return at::native::templates::bernoulli_impl_<BernoulliStub, Generator>(self, p_, std::move(gen));
 }
 
-Tensor& bernoulli_(Tensor& self, double p, const std::optional<Generator>& gen) {
-  return at::native::templates::bernoulli_impl_<BernoulliStub, Generator>(self, p, gen);
+Tensor& bernoulli_(Tensor& self, double p, c10::optional<Generator> gen) {
+  return at::native::templates::bernoulli_impl_<BernoulliStub, Generator>(self, p, std::move(gen));
 }
 
 // ================================================== LogNormal =======================================================
 
 template<typename RNG>
 struct LogNormalStub {
-  void operator()(TensorIteratorBase& iter, double mean, double std, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional<Generator> gen) {
     log_normal_stub(iter.device_type(), iter, mean, std, gen);
   }
 };
 
-Tensor& log_normal_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
-  return at::native::templates::log_normal_impl_<LogNormalStub, Generator>(self, mean, std, gen);
+Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
+  return at::native::templates::log_normal_impl_<LogNormalStub, Generator>(self, mean, std, std::move(gen));
 }
 
 // ==================================================== Cauchy ========================================================
 
 template<typename RNG>
 struct CauchyStub {
-  void operator()(TensorIteratorBase& iter, double median, double sigma, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional<Generator> gen) {
     cauchy_stub(iter.device_type(), iter, median, sigma, gen);
   }
 };
 
-Tensor& cauchy_(Tensor& self, double median, double sigma, const std::optional<Generator>& gen) {
-  return at::native::templates::cauchy_impl_<CauchyStub, Generator>(self, median, sigma, gen);
+Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional<Generator> gen) {
+  return at::native::templates::cauchy_impl_<CauchyStub, Generator>(self, median, sigma, std::move(gen));
 }
 
 // ================================================== Exponential =====================================================
 
 template<typename RNG>
 struct ExponentialStub {
-  void operator()(TensorIteratorBase& iter, double lambda, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
     exponential_stub(iter.device_type(), iter, lambda, gen);
   }
 };
 
-Tensor& exponential_(Tensor& self, double lambda, const std::optional<Generator>& gen) {
-  return at::native::templates::exponential_impl_<ExponentialStub, Generator>(self, lambda, gen);
+Tensor& exponential_(Tensor& self, double lambda, c10::optional<Generator> gen) {
+  return at::native::templates::exponential_impl_<ExponentialStub, Generator>(self, lambda, std::move(gen));
 }
 
 // =================================================== Geometric ======================================================
 
 template<typename RNG>
 struct GeometricStub {
-  void operator()(TensorIteratorBase& iter, double p, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
     geometric_stub(iter.device_type(), iter, p, gen);
   }
 };
 
-Tensor& geometric_(Tensor& self, double p, const std::optional<Generator>& gen) {
-  return at::native::templates::geometric_impl_<GeometricStub, Generator>(self, p, gen);
+Tensor& geometric_(Tensor& self, double p, c10::optional<Generator> gen) {
+  return at::native::templates::geometric_impl_<GeometricStub, Generator>(self, p, std::move(gen));
 }
 
 // ==================================================== Uniform =======================================================
 
 template<typename RNG>
 struct UniformStub {
-  void operator()(TensorIteratorBase& iter, double from, double to, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
     uniform_stub(iter.device_type(), iter, from, to, gen);
   }
 };
@@ -257,23 +257,23 @@
 template<typename RNG>
 struct UniformMeta {
   // No-op!
-  void operator()(TensorIteratorBase& iter, double from, double to, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
   }
 };
 
-Tensor& uniform_(Tensor& self, double from, double to, const std::optional<Generator>& gen) {
-  return at::native::templates::uniform_impl_<UniformStub, Generator>(self, from, to, gen);
+Tensor& uniform_(Tensor& self, double from, double to, c10::optional<Generator> gen) {
+  return at::native::templates::uniform_impl_<UniformStub, Generator>(self, from, to, std::move(gen));
 }
 
-Tensor& uniform_meta_(Tensor& self, double from, double to, const std::optional<Generator>& gen) {
-  return at::native::templates::uniform_impl_<UniformMeta, Generator>(self, from, to, gen);
+Tensor& uniform_meta_(Tensor& self, double from, double to, c10::optional<Generator> gen) {
+  return at::native::templates::uniform_impl_<UniformMeta, Generator>(self, from, to, std::move(gen));
 }
 
 // ==================================================== Normal ========================================================
 
 template<typename RNG>
 struct NormalStub {
-  void operator()(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+  void operator()(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
     normal_stub(self.device().type(), self, mean, std, gen);
   }
 };
@@ -281,76 +281,76 @@
 template<typename RNG>
 struct NormalMeta {
   // No-op!
-  void operator()(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+  void operator()(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
   }
 };
 
 // inplace
-Tensor& normal_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl_<NormalStub, Generator>(self, mean, std, gen);
+Tensor& normal_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl_<NormalStub, Generator>(self, mean, std, std::move(gen));
 }
 
-Tensor& normal_meta_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl_<NormalMeta, Generator>(self, mean, std, gen);
+Tensor& normal_meta_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl_<NormalMeta, Generator>(self, mean, std, std::move(gen));
 }
 
 // out tensor float
-Tensor& normal_out(const Tensor& mean, double std, const std::optional<Generator>& gen, Tensor& output) {
-  return at::native::templates::normal_out_impl<NormalStub, Generator>(output, mean, std, gen);
+Tensor& normal_out(const Tensor& mean, double std, c10::optional<Generator> gen, Tensor& output) {
+  return at::native::templates::normal_out_impl<NormalStub, Generator>(output, mean, std, std::move(gen));
 }
 
-Tensor& normal_out_meta(const Tensor& mean, double std, const std::optional<Generator>& gen, Tensor& output) {
-  return at::native::templates::normal_out_impl<NormalMeta, Generator>(output, mean, std, gen);
+Tensor& normal_out_meta(const Tensor& mean, double std, c10::optional<Generator> gen, Tensor& output) {
+  return at::native::templates::normal_out_impl<NormalMeta, Generator>(output, mean, std, std::move(gen));
 }
 
 // out float tensor
-Tensor& normal_out(double mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& output) {
-  return at::native::templates::normal_out_impl<NormalStub, Generator>(output, mean, std, gen);
+Tensor& normal_out(double mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
+  return at::native::templates::normal_out_impl<NormalStub, Generator>(output, mean, std, std::move(gen));
 }
 
-Tensor& normal_out_meta(double mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& output) {
-  return at::native::templates::normal_out_impl<NormalMeta, Generator>(output, mean, std, gen);
+Tensor& normal_out_meta(double mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
+  return at::native::templates::normal_out_impl<NormalMeta, Generator>(output, mean, std, std::move(gen));
 
 }
 
 // out tensor tensor
-Tensor& normal_out(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& output) {
-  return at::native::templates::normal_out_impl<NormalStub, Generator>(output, mean, std, gen);
+Tensor& normal_out(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
+  return at::native::templates::normal_out_impl<NormalStub, Generator>(output, mean, std, std::move(gen));
 }
 
-Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& output) {
-  return at::native::templates::normal_out_impl<NormalMeta, Generator>(output, mean, std, gen);
+Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
+  return at::native::templates::normal_out_impl<NormalMeta, Generator>(output, mean, std, std::move(gen));
 }
 
 // functional tensor float
-Tensor normal(const Tensor& mean, double std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl<NormalStub, Generator>(mean, std, gen);
+Tensor normal(const Tensor& mean, double std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl<NormalStub, Generator>(mean, std, std::move(gen));
 }
 
-Tensor normal_meta(const Tensor& mean, double std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl<NormalMeta, Generator>(mean, std, gen);
+Tensor normal_meta(const Tensor& mean, double std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl<NormalMeta, Generator>(mean, std, std::move(gen));
 }
 
 // functional float tensor
-Tensor normal(double mean, const Tensor& std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl<NormalStub, Generator>(mean, std, gen);
+Tensor normal(double mean, const Tensor& std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl<NormalStub, Generator>(mean, std, std::move(gen));
 }
 
-Tensor normal_meta(double mean, const Tensor& std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl<NormalMeta, Generator>(mean, std, gen);
+Tensor normal_meta(double mean, const Tensor& std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl<NormalMeta, Generator>(mean, std, std::move(gen));
 }
 
 // functional tensor tensor
-Tensor normal(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl<NormalStub, Generator>(mean, std, gen);
+Tensor normal(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl<NormalStub, Generator>(mean, std, std::move(gen));
 }
 
-Tensor normal_meta(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen) {
-  return at::native::templates::normal_impl<NormalMeta, Generator>(mean, std, gen);
+Tensor normal_meta(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
+  return at::native::templates::normal_impl<NormalMeta, Generator>(mean, std, std::move(gen));
 }
 
 // functional variant, only used by the functionalization pass.
-Tensor normal_functional(const Tensor& self, double mean, double std, const std::optional<at::Generator>& generator) {
+Tensor normal_functional(const Tensor& self, double mean, double std, c10::optional<at::Generator> generator) {
   return self.clone().normal_(mean, std, std::move(generator));
 }
 
@@ -358,44 +358,44 @@
 
 template<typename RNG>
 struct RandomStub {
-  void operator()(TensorIteratorBase& iter, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
     random_stub(iter.device_type(), iter, gen);
   }
 };
 
-Tensor& random_(Tensor& self, const std::optional<Generator>& gen) {
-  return at::native::templates::random_impl<RandomStub, Generator>(self, gen);
+Tensor& random_(Tensor& self, c10::optional<Generator> gen) {
+  return at::native::templates::random_impl<RandomStub, Generator>(self, std::move(gen));
 }
 
 template<typename RNG>
 struct RandomFromToStub {
-  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t from, c10::optional<Generator> gen) {
     random_from_to_stub(iter.device_type(), iter, range, from, gen);
   }
-  void operator()(TensorIteratorBase& iter, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
     random_full_64_bits_range_stub(iter.device_type(), iter, gen);
   }
 };
 
-Tensor& random_(Tensor& self, int64_t from, optional<int64_t> to, const std::optional<Generator>& gen) {
-  return at::native::templates::random_from_to_impl<RandomFromToStub, Generator>(self, from, to, gen);
+Tensor& random_(Tensor& self, int64_t from, optional<int64_t> to, c10::optional<Generator> gen) {
+  return at::native::templates::random_from_to_impl<RandomFromToStub, Generator>(self, from, to, std::move(gen));
 }
 
-Tensor& random_(Tensor& self, int64_t to, const std::optional<Generator>& gen) {
-  return random_(self, 0, to, gen);
+Tensor& random_(Tensor& self, int64_t to, c10::optional<Generator> gen) {
+  return random_(self, 0, to, std::move(gen));
 }
 
-Tensor& random_meta_(Tensor& self, const std::optional<Generator>& gen) {
+Tensor& random_meta_(Tensor& self, c10::optional<Generator> gen) {
   // No error checking yay
   return self;
 }
 
-Tensor& random_meta_(Tensor& self, int64_t from, optional<int64_t> to, const std::optional<Generator>& gen) {
+Tensor& random_meta_(Tensor& self, int64_t from, optional<int64_t> to, c10::optional<Generator> gen) {
   // No error checking yay
   return self;
 }
 
-Tensor& random_meta_(Tensor& self, int64_t to, const std::optional<Generator>& gen) {
+Tensor& random_meta_(Tensor& self, int64_t to, c10::optional<Generator> gen) {
   // No error checking yay
   return self;
 }
@@ -437,7 +437,7 @@
  * This section is a counterpart to Distributions.cu
  */
 
-Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, const std::optional<Generator>& gen) {
+Tensor _s_binomial_cpu(const Tensor& count, const Tensor& prob, c10::optional<Generator> gen) {
   Tensor ret = at::zeros(count.sizes(), count.options());
   auto iter = TensorIteratorConfig()
     .add_output(ret)
@@ -462,7 +462,7 @@
   return ret;
 }
 
-Tensor _s_poisson_cpu(const Tensor& lambda, const std::optional<Generator>& gen) {
+Tensor _s_poisson_cpu(const Tensor& lambda, c10::optional<Generator> gen) {
   Tensor ret = at::zeros(lambda.sizes(), lambda.options());
   auto iter = TensorIteratorConfig()
     .add_output(ret)
@@ -479,7 +479,7 @@
   return ret;
 }
 
-Tensor _s_gamma_cpu(const Tensor& alpha, const std::optional<Generator>& gen) {
+Tensor _s_gamma_cpu(const Tensor& alpha, c10::optional<Generator> gen) {
   Tensor ret = at::zeros(alpha.sizes(), alpha.options());
   auto iter = TensorIteratorConfig()
     .add_output(ret)
@@ -509,7 +509,7 @@
   return ret;
 }
 
-Tensor _s_dirichlet_cpu(const Tensor& alpha, const std::optional<Generator>& gen) {
+Tensor _s_dirichlet_cpu(const Tensor& alpha, c10::optional<Generator> gen) {
   Tensor ret = at::zeros(alpha.sizes(), alpha.options());
   AT_DISPATCH_FLOATING_TYPES(ret.scalar_type(), "dirichlet", [&] {
     Tensor gamma = at::zeros(alpha.sizes(), alpha.options().dtype(ScalarType::Double));
@@ -562,7 +562,7 @@
 Tensor& multinomial_out(const Tensor& self,
     int64_t n_sample,
     bool with_replacement,
-    const std::optional<Generator>& gen,
+    c10::optional<Generator> gen,
     Tensor& result) {
   TORCH_CHECK(
       result.device() == self.device(),
@@ -622,7 +622,7 @@
     // s = argmax( p / (-log(eps)) ) where eps ~ U(0, 1).
     // We can also simplify the formula above by
     // s = argmax( p / q ) where q ~ Exp(1)
-    Tensor q = at::empty_like(self).exponential_(1, gen);
+    Tensor q = at::empty_like(self).exponential_(1, std::move(gen));
     // In theory the probability to generate 0 from exponential distribution is
     // 0. However, on CUDA side there is a protection to avoid 0s, but on CPU
     // side, there is a very low probability to generate 0 from
@@ -647,9 +647,9 @@
     const Tensor& self,
     int64_t n_sample,
     bool with_replacement,
-    const std::optional<Generator>& gen) {
+    c10::optional<Generator> gen) {
   Tensor result = at::empty({0}, self.options().dtype(kLong));
-  native::multinomial_out(self, n_sample, with_replacement, gen, result);
+  native::multinomial_out(self, n_sample, with_replacement, std::move(gen), result);
   return result;
 }
 
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index bbe5311..add09e2 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -878,10 +878,10 @@
     c10::optional<Layout> layout,
     c10::optional<Device> device,
     c10::optional<bool> pin_memory) {
-  return native::rand(size, static_cast<const std::optional<Generator>&>(c10::nullopt), dtype, layout, device, pin_memory);
+  return native::rand(size, static_cast<c10::optional<Generator>>(c10::nullopt), dtype, layout, device, pin_memory);
 }
 
-Tensor rand(IntArrayRef size, const std::optional<Generator>& generator,
+Tensor rand(IntArrayRef size, c10::optional<Generator> generator,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
     c10::optional<Device> device,
@@ -897,7 +897,7 @@
   return native::rand_out(size, c10::nullopt, result);
 }
 
-Tensor& rand_out(IntArrayRef size, const std::optional<Generator>& generator, Tensor& result) {
+Tensor& rand_out(IntArrayRef size, c10::optional<Generator> generator, Tensor& result) {
   result.resize_(size);
   return result.uniform_(0, 1, std::move(generator));
 }
@@ -929,7 +929,7 @@
 Tensor randint(
     int64_t high,
     IntArrayRef size,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
     c10::optional<Device> device,
@@ -952,7 +952,7 @@
     int64_t low,
     int64_t high,
     IntArrayRef size,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
     c10::optional<Device> device,
@@ -970,7 +970,7 @@
 
 Tensor& randint_out(int64_t high,
     IntArrayRef size,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     Tensor& result) {
   result.resize_(size);
   return result.random_(0, high, std::move(generator));
@@ -983,7 +983,7 @@
 Tensor& randint_out(int64_t low,
     int64_t high,
     IntArrayRef size,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     Tensor& result) {
   result.resize_(size);
   return result.random_(low, high, std::move(generator));
@@ -1027,10 +1027,10 @@
     c10::optional<Layout> layout,
     c10::optional<Device> device,
     c10::optional<bool> pin_memory) {
-  return native::randn(size, static_cast<const std::optional<Generator>&>(c10::nullopt), dtype, layout, device, pin_memory);
+  return native::randn(size, static_cast<c10::optional<Generator>>(c10::nullopt), dtype, layout, device, pin_memory);
 }
 
-Tensor randn(IntArrayRef size, const std::optional<Generator>& generator,
+Tensor randn(IntArrayRef size, c10::optional<Generator> generator,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
     c10::optional<Device> device,
@@ -1046,13 +1046,13 @@
   return native::randn_out(size, c10::nullopt, result);
 }
 
-Tensor& randn_out(IntArrayRef size, const std::optional<Generator>& generator, Tensor& result) {
+Tensor& randn_out(IntArrayRef size, c10::optional<Generator> generator, Tensor& result) {
   result.resize_(size);
   return result.normal_(0, 1, std::move(generator));
 }
 
 Tensor normal(double mean, double std, IntArrayRef size,
-              const std::optional<Generator>& generator,
+              c10::optional<Generator> generator,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
     c10::optional<Device> device,
@@ -1065,7 +1065,7 @@
 }
 
 Tensor& normal_out(double mean, double std,
-                   IntArrayRef size, const std::optional<Generator>& generator, Tensor& result) {
+                   IntArrayRef size, c10::optional<Generator> generator, Tensor& result) {
   result.resize_(size);
   return result.normal_(mean, std, std::move(generator));
 }
@@ -1120,7 +1120,7 @@
   return native::randperm(n, c10::nullopt, dtype, layout, device, pin_memory);
 }
 
-Tensor randperm(int64_t n, const std::optional<Generator>& generator,
+Tensor randperm(int64_t n, c10::optional<Generator> generator,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
     c10::optional<Device> device,
@@ -1140,7 +1140,7 @@
   return at::randperm_out(result, n, c10::nullopt);
 }
 
-Tensor& randperm_out_cpu(int64_t n, const std::optional<Generator>& generator, Tensor& result) {
+Tensor& randperm_out_cpu(int64_t n, c10::optional<Generator> generator, Tensor& result) {
   TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
   TORCH_CHECK(!generator.has_value() || (generator.has_value() && result.device() == generator->device()), "Expected a '", result.device(), "' generator device but found '", generator->device(), "'");
   check_supported_max_int_with_precision(n, result);
@@ -1809,7 +1809,7 @@
 
 Tensor randn(
     IntArrayRef size,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     optional<DimnameList> names,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
@@ -1834,7 +1834,7 @@
 
 Tensor rand(
     IntArrayRef size,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     optional<DimnameList> names,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h
index e791a2f..91d4d84 100644
--- a/aten/src/ATen/native/UnaryOps.h
+++ b/aten/src/ATen/native/UnaryOps.h
@@ -93,23 +93,23 @@
 DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub);
 
 // NB: these are actually defined in Distribution
-DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, const std::optional<Generator>&), bernoulli_tensor_stub);
-DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const std::optional<Generator>&), bernoulli_scalar_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, const std::optional<Generator>&), cauchy_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const std::optional<Generator>&), exponential_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const std::optional<Generator>&), geometric_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, const std::optional<Generator>&), log_normal_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, const std::optional<Generator>&), uniform_stub);
-DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, const std::optional<Generator>&), normal_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, const std::optional<Generator>&), random_from_to_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const std::optional<Generator>&), random_full_64_bits_range_stub);
-DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const std::optional<Generator>&), random_stub);
+DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional<Generator>), bernoulli_tensor_stub);
+DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional<Generator>), bernoulli_scalar_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), cauchy_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), exponential_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional<Generator>), geometric_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), log_normal_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional<Generator>), uniform_stub);
+DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional<Generator>), normal_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional<Generator>), random_from_to_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_full_64_bits_range_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional<Generator>), random_stub);
 
 DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub);
 DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub);
 DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub);
 DECLARE_DISPATCH(
-    void (*)(Tensor&, const Tensor&, int64_t, const std::optional<Generator>&),
+    void (*)(Tensor&, const Tensor&, int64_t, c10::optional<Generator>),
     multinomial_with_replacement_stub);
 DECLARE_DISPATCH(
     void (*)(
diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp
index c12ee3a..6dce481 100644
--- a/aten/src/ATen/native/cpu/DistributionKernels.cpp
+++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp
@@ -26,27 +26,27 @@
 namespace at::native {
 namespace {
 
-static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, const std::optional<Generator>& gen) {
+static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::cauchy_kernel(iter, median, sigma, generator);
 }
 
-void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, const std::optional<Generator>& gen) {
+void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::bernoulli_kernel(self, p_, generator);
 }
 
 #if !AT_MKL_ENABLED()
-void bernoulli_scalar_kernel_default(const TensorBase &self, double p, const std::optional<Generator>& gen) {
+void bernoulli_scalar_kernel_default(const TensorBase &self, double p, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::bernoulli_kernel(self, p, generator);
 }
 
-void bernoulli_scalar_kernel(const TensorBase &self, double p, const std::optional<Generator>& gen) {
+void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
   bernoulli_scalar_kernel_default(self, p, gen);
 }
 #else
-void bernoulli_scalar_kernel(const TensorBase &self, double p, const std::optional<Generator>& gen) {
+void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   int64_t seed;
   {
@@ -99,17 +99,17 @@
 }
 #endif
 
-static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, const std::optional<Generator>& gen) {
+static void exponential_kernel_default(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::exponential_kernel(iter, lambda, generator);
 }
 
 #if (!AT_MKL_ENABLED() || defined(FBCODE_CAFFE2))
-void exponential_kernel(TensorIteratorBase& iter, double lambda, const std::optional<Generator>& gen) {
+void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
   exponential_kernel_default(iter, lambda, gen);
 }
 #else
-void exponential_kernel(TensorIteratorBase &iter, double lambda, const std::optional<Generator>& gen) {
+void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional<Generator> gen) {
   TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
 
   Tensor self = iter.tensor(0);
@@ -195,32 +195,32 @@
 }
 #endif
 
-static void geometric_kernel(TensorIteratorBase& iter, double p, const std::optional<Generator>& gen) {
+static void geometric_kernel(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::geometric_kernel(iter, p, generator);
 }
 
-static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, const std::optional<Generator>& gen) {
+static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::log_normal_kernel(iter, mean, std, generator);
 }
 
-void uniform_kernel(TensorIteratorBase& iter, double from, double to, const std::optional<Generator>& gen) {
+void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::uniform_kernel(iter, from, to, generator);
 }
 
-void normal_kernel(const TensorBase &self, double mean, double std, const std::optional<Generator>& gen) {
+void normal_kernel(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::normal_kernel(self, mean, std, generator);
 }
 
-static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, const std::optional<Generator>& gen) {
+static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::random_from_to_kernel(iter, range, base, generator);
 }
 
-static void random_kernel(TensorIteratorBase& iter, const std::optional<Generator>& gen) {
+static void random_kernel(TensorIteratorBase& iter, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::random_kernel(iter, generator);
 }
@@ -228,7 +228,7 @@
 // This is the special kernel to handle single specific case:
 // from(inclusive) = std::numeric_limits<int64_t>::lowest()
 // to(exclusive) = None (= std::numeric_limits<int64_t>::max() + 1)
-static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, const std::optional<Generator>& gen) {
+static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional<Generator> gen) {
   CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
   templates::cpu::random_full_64_bits_range_kernel(iter, generator);
 }
diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h
index 5419293..1a1039b 100644
--- a/aten/src/ATen/native/cpu/DistributionTemplates.h
+++ b/aten/src/ATen/native/cpu/DistributionTemplates.h
@@ -58,10 +58,10 @@
 
 template<typename RNG>
 struct RandomFromToKernel {
-  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional<Generator> gen) {
     random_from_to_kernel(iter, range, base, check_generator<RNG>(gen));
   }
-  void operator()(TensorIteratorBase& iter, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
     random_full_64_bits_range_kernel(iter, check_generator<RNG>(gen));
   }
 };
@@ -79,7 +79,7 @@
 
 template<typename RNG>
 struct RandomKernel {
-  void operator()(TensorIteratorBase& iter, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
     random_kernel(iter, check_generator<RNG>(gen));
   }
 };
@@ -200,7 +200,7 @@
 
 template<typename RNG>
 struct NormalKernel {
-  void operator()(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+  void operator()(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
     normal_kernel(self, mean, std, check_generator<RNG>(gen));
   }
 };
@@ -222,7 +222,7 @@
 
 template<typename RNG>
 struct UniformKernel {
-  void operator()(TensorIteratorBase& iter, double from, double to, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
     uniform_kernel(iter, from, to, check_generator<RNG>(gen));
   }
 };
@@ -242,7 +242,7 @@
 
 template<typename RNG>
 struct CauchyKernel {
-  void operator()(TensorIteratorBase& iter, double median, double sigma, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional<Generator> gen) {
     cauchy_kernel(iter, median, sigma, check_generator<RNG>(gen));
   }
 };
@@ -262,7 +262,7 @@
 
 template<typename RNG>
 struct LogNormalKernel {
-  void operator()(TensorIteratorBase& iter, double mean, double std, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional<Generator> gen) {
     log_normal_kernel(iter, mean, std, check_generator<RNG>(gen));
   }
 };
@@ -282,7 +282,7 @@
 
 template<typename RNG>
 struct GeometricKernel {
-  void operator()(TensorIteratorBase& iter, double p, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
     geometric_kernel(iter, p, check_generator<RNG>(gen));
   }
 };
@@ -303,7 +303,7 @@
 
 template<typename RNG>
 struct ExponentialKernel {
-  void operator()(TensorIteratorBase& iter, double lambda, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
     exponential_kernel(iter, lambda, check_generator<RNG>(gen));
   }
 };
@@ -358,10 +358,10 @@
 
 template<typename RNG>
 struct BernoulliKernel {
-  void operator()(const TensorBase &self, double p, const std::optional<Generator>& gen) {
+  void operator()(const TensorBase &self, double p, c10::optional<Generator> gen) {
     bernoulli_kernel(self, p, check_generator<RNG>(gen));
   }
-  void operator()(const TensorBase &self, const TensorBase &p_, const std::optional<Generator>& gen) {
+  void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
     bernoulli_kernel(self, p_, check_generator<RNG>(gen));
   }
 };
diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp
index 720b108..c5c2eeb 100644
--- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp
+++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp
@@ -24,7 +24,7 @@
     Tensor& result,
     const Tensor& self,
     const int64_t n_sample,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   auto gen = get_generator_or_default<CPUGeneratorImpl>(
       generator, detail::getDefaultCPUGenerator());
   // See Note [Acquire lock when using random generators]
@@ -128,7 +128,7 @@
     Tensor& result,
     const Tensor& self,
     const int64_t n_sample,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   auto gen = get_generator_or_default<CPUGeneratorImpl>(
       generator, detail::getDefaultCPUGenerator());
   // See Note [Acquire lock when using random generators]
@@ -230,7 +230,7 @@
     Tensor& result,
     const Tensor& self,
     const int64_t n_sample,
-    const std::optional<Generator>& gen) {
+    c10::optional<Generator> gen) {
   AT_DISPATCH_FLOATING_TYPES_AND2(
       kHalf, kBFloat16, self.scalar_type(), "multinomial", [&] {
         multinomial_with_replacement_apply<scalar_t>(
diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu
index 298a1b1..89a5182 100644
--- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu
+++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu
@@ -23,12 +23,12 @@
 
 namespace at::native {
 
-void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, const std::optional<Generator>& gen_) {
+void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen_) {
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::bernoulli_kernel(self, p_, generator);
 }
 
-void bernoulli_scalar_kernel(const TensorBase &self, double p, const std::optional<Generator>& gen) {
+void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional<Generator> gen) {
   auto iter = TensorIterator::borrowing_nullary_op(self);
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::bernoulli_kernel(iter, p, generator);
diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
index 5e5cd01..a66d3cf 100644
--- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
@@ -5,7 +5,7 @@
 
 namespace at::native {
 
-void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, const std::optional<Generator>& gen) {
+void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional<Generator> gen) {
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::cauchy_kernel(iter, median, sigma, generator);
 }
diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
index 3890470..76cb94f 100644
--- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
@@ -5,7 +5,7 @@
 
 namespace at::native {
 
-void exponential_kernel(TensorIteratorBase& iter, double lambda, const std::optional<Generator>& gen) {
+void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::exponential_kernel(iter, lambda, generator);
 }
diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
index 092cb47..0fe49d7 100644
--- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
@@ -5,7 +5,7 @@
 
 namespace at::native {
 
-void geometric_kernel(TensorIteratorBase& iter, double p_, const std::optional<Generator>& gen) {
+void geometric_kernel(TensorIteratorBase& iter, double p_, c10::optional<Generator> gen) {
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::geometric_kernel(iter, p_, generator);
 }
diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
index d49cadf..f394d4f 100644
--- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
@@ -5,7 +5,7 @@
 
 namespace at::native {
 
-void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, const std::optional<Generator>& gen) {
+void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional<Generator> gen) {
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::log_normal_kernel(iter, mean, std, generator);
 }
diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu
index 45e4e20..a17c3e3 100644
--- a/aten/src/ATen/native/cuda/DistributionNormal.cu
+++ b/aten/src/ATen/native/cuda/DistributionNormal.cu
@@ -5,7 +5,7 @@
 
 namespace at::native {
 
-void normal_kernel(const TensorBase &self, double mean, double std, const std::optional<Generator>& gen) {
+void normal_kernel(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::normal_kernel(self, mean, std, generator);
 }
diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
index 734f775..034a19c 100644
--- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
+++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
@@ -5,17 +5,17 @@
 
 namespace at::native {
 
-void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, const std::optional<Generator>& gen_) {
+void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::random_from_to_kernel(iter, range, base, gen);
 }
 
-void random_full_64_bits_range_kernel(TensorIteratorBase& iter, const std::optional<Generator>& gen_) {
+void random_full_64_bits_range_kernel(TensorIteratorBase& iter, c10::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::random_full_64_bits_range_kernel(iter, gen);
 }
 
-void random_kernel(TensorIteratorBase& iter, const std::optional<Generator>& gen_) {
+void random_kernel(TensorIteratorBase& iter, c10::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   at::native::templates::cuda::random_kernel(iter, gen);
 }
diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h
index 8e37138..04a278d 100644
--- a/aten/src/ATen/native/cuda/DistributionTemplates.h
+++ b/aten/src/ATen/native/cuda/DistributionTemplates.h
@@ -352,10 +352,10 @@
 
 template<typename RNG>
 struct RandomFromToKernel {
-  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional<Generator> gen) {
     random_from_to_kernel(iter, range, base, check_generator<RNG>(gen));
   }
-  void operator()(TensorIteratorBase& iter, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, c10::optional<Generator> gen) {
     random_full_64_bits_range_kernel(iter, check_generator<RNG>(gen));
   }
 };
@@ -448,7 +448,7 @@
 
 template<typename RNG>
 struct NormalKernel {
-  void operator()(const TensorBase &self, double mean, double std, const std::optional<Generator>& gen) {
+  void operator()(const TensorBase &self, double mean, double std, c10::optional<Generator> gen) {
     normal_kernel(self, mean, std, check_generator<RNG>(gen));
   }
 };
@@ -481,7 +481,7 @@
 
 template<typename RNG>
 struct UniformKernel {
-  void operator()(TensorIteratorBase& iter, double from, double to, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
     uniform_kernel(iter, from, to, check_generator<RNG>(gen));
   }
 };
@@ -504,7 +504,7 @@
 
 template<typename RNG>
 struct LogNormalKernel {
-  void operator()(TensorIteratorBase& iter, double mean, double std, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional<Generator> gen) {
     log_normal_kernel(iter, mean, std, check_generator<RNG>(gen));
   }
 };
@@ -525,7 +525,7 @@
 
 template<typename RNG>
 struct GeometricKernel {
-  void operator()(TensorIteratorBase& iter, double p, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
     geometric_kernel(iter, p, check_generator<RNG>(gen));
   }
 };
@@ -548,7 +548,7 @@
 
 template<typename RNG>
 struct ExponentialKernel {
-  void operator()(TensorIteratorBase& iter, double lambda, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double lambda, c10::optional<Generator> gen) {
     exponential_kernel(iter, lambda, check_generator<RNG>(gen));
   }
 };
@@ -571,7 +571,7 @@
 
 template<typename RNG>
 struct CauchyKernel {
-  void operator()(TensorIteratorBase& iter, double median, double sigma, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional<Generator> gen) {
     cauchy_kernel(iter, median, sigma, check_generator<RNG>(gen));
   }
 };
@@ -661,10 +661,10 @@
 
 template<typename RNG>
 struct BernoulliKernel {
-  void operator()(TensorIteratorBase& iter, double p, const std::optional<Generator>& gen) {
+  void operator()(TensorIteratorBase& iter, double p, c10::optional<Generator> gen) {
     bernoulli_kernel(iter, p, check_generator<RNG>(gen));
   }
-  void operator()(const TensorBase &self, const TensorBase &p_, const std::optional<Generator>& gen) {
+  void operator()(const TensorBase &self, const TensorBase &p_, c10::optional<Generator> gen) {
     bernoulli_kernel(self, p_, check_generator<RNG>(gen));
   }
 };
diff --git a/aten/src/ATen/native/cuda/DistributionUniform.cu b/aten/src/ATen/native/cuda/DistributionUniform.cu
index b83649e..2ebdfa4 100644
--- a/aten/src/ATen/native/cuda/DistributionUniform.cu
+++ b/aten/src/ATen/native/cuda/DistributionUniform.cu
@@ -5,7 +5,7 @@
 
 namespace at::native {
 
-void uniform_kernel(TensorIteratorBase& iter, double from, double to, const std::optional<Generator>& gen) {
+void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optional<Generator> gen) {
   auto generator = get_generator_or_default<CUDAGeneratorImpl>(gen, cuda::detail::getDefaultCUDAGenerator());
   templates::cuda::uniform_kernel(iter, from, to, generator);
 }
diff --git a/aten/src/ATen/native/cuda/Distributions.cpp b/aten/src/ATen/native/cuda/Distributions.cpp
index 5804fd3..c0d5abb 100644
--- a/aten/src/ATen/native/cuda/Distributions.cpp
+++ b/aten/src/ATen/native/cuda/Distributions.cpp
@@ -18,14 +18,14 @@
 
 namespace at::native {
 
-Tensor _s_poisson_cuda(const Tensor& lambda, const std::optional<Generator>& gen_) {
+Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(lambda.sizes(), lambda.options());
   launch_poisson_cuda_kernel(ret, lambda, gen);
   return ret;
 }
 
-Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, const std::optional<Generator>& gen_) {
+Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(count.sizes(), count.options());
   at::TensorIterator iter = at::TensorIteratorConfig()
@@ -37,14 +37,14 @@
   return ret;
 }
 
-Tensor _s_gamma_cuda(const Tensor& alpha, const std::optional<Generator>& gen_) {
+Tensor _s_gamma_cuda(const Tensor& alpha, c10::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(alpha.sizes(), alpha.options());
   launch_gamma_kernel(ret, alpha, gen);
   return ret;
 }
 
-Tensor _s_dirichlet_cuda(const Tensor& alpha, const std::optional<Generator>& gen_) {
+Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional<Generator> gen_) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   Tensor ret = at::empty(alpha.sizes(), alpha.options());
   launch_gamma_kernel(ret, alpha, gen);
diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu
index 85aac02..67ea3e4 100644
--- a/aten/src/ATen/native/cuda/Dropout.cu
+++ b/aten/src/ATen/native/cuda/Dropout.cu
@@ -387,7 +387,7 @@
 
 // TODO: _fused_dropout_cuda is to be removed, see PR #63937
 std::tuple<Tensor,Tensor>
-fused_dropout_cuda(const Tensor& self, double p, const std::optional<Generator>& gen_){
+fused_dropout_cuda(const Tensor& self, double p, c10::optional<Generator> gen_){
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(gen_, cuda::detail::getDefaultCUDAGenerator());
   return dropout_cuda<uint8_t>(gen, self, p);
 }
diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu
index 01e647d..d8f142a 100644
--- a/aten/src/ATen/native/cuda/MultinomialKernel.cu
+++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu
@@ -328,7 +328,7 @@
     Tensor& result,
     const Tensor& self,
     const int64_t n_sample,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   auto gen = get_generator_or_default<CUDAGeneratorImpl>(generator, cuda::detail::getDefaultCUDAGenerator());
 
   int inputSize = self.dim();
diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu
index 0985227..c22c99d 100644
--- a/aten/src/ATen/native/cuda/Randperm.cu
+++ b/aten/src/ATen/native/cuda/Randperm.cu
@@ -55,7 +55,7 @@
 template <int N> struct alignas(N) OpaqueType { char data[N]; };
 }
 
-Tensor& randperm_out_cuda(int64_t n, const std::optional<Generator>& generator, Tensor& result) {
+Tensor& randperm_out_cuda(int64_t n, c10::optional<Generator> generator, Tensor& result) {
   TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
 
   check_supported_max_int_with_precision(n, result);
diff --git a/aten/src/ATen/native/cuda/Randperm.cuh b/aten/src/ATen/native/cuda/Randperm.cuh
index a7c31a3..de5affe 100644
--- a/aten/src/ATen/native/cuda/Randperm.cuh
+++ b/aten/src/ATen/native/cuda/Randperm.cuh
@@ -40,7 +40,7 @@
 
 // See note [Algorithm of randperm]
 template<typename T, typename scalar_t>
-void randperm_handle_duplicate_keys(T *keys, scalar_t *data, int bits, int64_t n, const std::optional<at::Generator> &gen_) {
+void randperm_handle_duplicate_keys(T *keys, scalar_t *data, int bits, int64_t n, c10::optional<at::Generator> &gen_) {
   auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());
   int64_t counter_offset = n;
   at::PhiloxCudaState rng_engine_inputs;
diff --git a/aten/src/ATen/native/cuda/RreluWithNoise.cu b/aten/src/ATen/native/cuda/RreluWithNoise.cu
index 5d0e128..463a5ce 100644
--- a/aten/src/ATen/native/cuda/RreluWithNoise.cu
+++ b/aten/src/ATen/native/cuda/RreluWithNoise.cu
@@ -74,7 +74,7 @@
     const Tensor& noise_,
     const Scalar& lower_,
     const Scalar& upper_,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   auto input = input_.contiguous();
   auto noise = noise_.contiguous();
   Tensor tmp_output = output.contiguous();
@@ -142,7 +142,7 @@
     const Scalar& lower,
     const Scalar& upper,
     bool training,
-    const std::optional<Generator>& generator,
+    c10::optional<Generator> generator,
     Tensor& output) {
   at::native::resize_output(output, self.sizes());
 
@@ -176,7 +176,7 @@
     const Scalar& lower,
     const Scalar& upper,
     bool training,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   Tensor output = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   return at::native::rrelu_with_noise_out_cuda(self, noise, lower, upper, training, generator, output);
 }
@@ -187,7 +187,7 @@
     const Scalar& lower,
     const Scalar& upper,
     bool training,
-    const std::optional<Generator>& generator) {
+    c10::optional<Generator> generator) {
   return at::native::rrelu_with_noise_out_cuda(
       self, noise, lower, upper, training, generator, self);
 }
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index 01437fe..7ed06c8 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -52,7 +52,7 @@
                         const c10::optional<Tensor>& mean_opt,
                         const c10::optional<Tensor>& std_opt,
                         MPSGraphRandomDistribution distribution,
-                        const std::optional<Generator>& gen,
+                        c10::optional<Generator> gen,
                         std::string op_name,
                         RandomOpBlock randomBlock) {
   if (self.numel() == 0) {
@@ -144,7 +144,7 @@
                                double std_s,
                                const c10::optional<Tensor>& mean_opt,
                                const c10::optional<Tensor>& std_opt,
-                               const std::optional<Generator>& gen,
+                               c10::optional<Generator> gen,
                                std::string op_name) {
   const Tensor& std_t = *(at::borrow_from_optional_tensor(std_opt));
   const Tensor& mean_t = *(at::borrow_from_optional_tensor(mean_opt));
@@ -198,7 +198,7 @@
 
 static Tensor& bernoulli_mps_impl(Tensor& self,
                                   const Tensor& prob_t,
-                                  const std::optional<Generator>& gen,
+                                  c10::optional<Generator> gen,
                                   std::string op_name) {
   TORCH_CHECK(prob_t.is_same_size(self) || prob_t.dim() == 0,
               op_name,
@@ -225,7 +225,7 @@
 
 } // namespace mps
 
-Tensor& uniform_mps_(Tensor& self, double from, double to, const std::optional<Generator>& gen) {
+Tensor& uniform_mps_(Tensor& self, double from, double to, c10::optional<Generator> gen) {
   auto scalar_type = self.scalar_type();
   if (scalar_type == ScalarType::ComplexFloat)
     scalar_type = ScalarType::Float;
@@ -257,16 +257,16 @@
       self, from, to, c10::nullopt, c10::nullopt, MPSGraphRandomDistributionUniform, gen, __func__, nullptr);
 }
 
-Tensor& normal_mps_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+Tensor& normal_mps_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
   return mps::normal_mps_impl(self, mean, std, c10::nullopt, c10::nullopt, gen, "normal");
 }
 
-Tensor normal_mps(const Tensor& mean, double std, const std::optional<Generator>& gen) {
+Tensor normal_mps(const Tensor& mean, double std, c10::optional<Generator> gen) {
   Tensor self = at::empty(mean.sizes(), mean.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt);
   return mps::normal_mps_impl(self, 0.0, std, mean, c10::nullopt, gen, "normal");
 }
 
-Tensor normal_mps(double mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor normal_mps(double mean, const Tensor& std, c10::optional<Generator> gen) {
   Tensor self = at::empty(std.sizes(), std.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt);
   // when there's no tensor-type mean, we cannot pass scalar mean value due to the order of
   // multiply/add ops in random computation. So we create a mean tensor instead.
@@ -274,45 +274,45 @@
   return mps::normal_mps_impl(self, 0.0, 1.0, mean_t, std, gen, "normal");
 }
 
-Tensor normal_mps(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
   auto shape = at::infer_size(mean.sizes(), std.sizes());
   Tensor self = at::empty(shape, mean.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt);
   return mps::normal_mps_impl(self, 0.0, 1.0, mean, std, gen, "normal");
 }
 
-Tensor& normal_mps_out(const Tensor& mean, double std, const std::optional<Generator>& gen, Tensor& self) {
+Tensor& normal_mps_out(const Tensor& mean, double std, c10::optional<Generator> gen, Tensor& self) {
   return mps::normal_mps_impl(self, 0.0, std, mean, c10::nullopt, gen, "normal");
 }
 
-Tensor& normal_mps_out(double mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& self) {
+Tensor& normal_mps_out(double mean, const Tensor& std, c10::optional<Generator> gen, Tensor& self) {
   // when there's no tensor-type mean, we cannot pass scalar mean value due to the order of
   // multiply/add ops in random computation. So we create a mean tensor instead.
   Tensor mean_t = at::full_like(self, Scalar(mean));
   return mps::normal_mps_impl(self, 0.0, 1.0, mean_t, std, gen, "normal");
 }
 
-Tensor& normal_mps_out(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& self) {
+Tensor& normal_mps_out(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen, Tensor& self) {
   TORCH_CHECK(mean.numel() == std.numel(), "normal_mps_out: mean and std must have same number of elements")
   return mps::normal_mps_impl(self, 0.0, 1.0, mean, std, gen, "normal");
 }
 
-Tensor& bernoulli_out_mps(const Tensor& p_, const std::optional<Generator>& gen, Tensor& result) {
+Tensor& bernoulli_out_mps(const Tensor& p_, c10::optional<Generator> gen, Tensor& result) {
   result.resize_(p_.sizes());
   return mps::bernoulli_mps_impl(result, p_, gen, __func__);
 }
 
-Tensor& bernoulli_mps_(Tensor& self, double p, const std::optional<Generator>& gen) {
+Tensor& bernoulli_mps_(Tensor& self, double p, c10::optional<Generator> gen) {
   TORCH_CHECK(0.0 <= p && p <= 1.0, "bernoulli_mps_ expects p to be in [0, 1], but got p=", p);
   Tensor prob_t = at::full({}, Scalar(p), c10::TensorOptions().dtype(kFloat).device(kMPS));
   return mps::bernoulli_mps_impl(self, prob_t, gen, __func__);
 }
 
-Tensor& bernoulli_mps_(Tensor& self, const Tensor& p_, const std::optional<Generator>& gen) {
+Tensor& bernoulli_mps_(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
   return mps::bernoulli_mps_impl(self, p_, gen, __func__);
 }
 
 // random_.from
-Tensor& random_mps_(Tensor& self, int64_t from, c10::optional<int64_t> to_opt, const std::optional<Generator>& gen) {
+Tensor& random_mps_(Tensor& self, int64_t from, c10::optional<int64_t> to_opt, c10::optional<Generator> gen) {
   auto input_dtype = self.scalar_type();
   int64_t to = 0;
 
@@ -372,16 +372,16 @@
       self, from, to - 1, c10::nullopt, c10::nullopt, MPSGraphRandomDistributionUniform, gen, __func__, nullptr);
 }
 
-Tensor& random_mps_(Tensor& self, int64_t to, const std::optional<Generator>& gen) {
+Tensor& random_mps_(Tensor& self, int64_t to, c10::optional<Generator> gen) {
   return random_mps_(self, 0, to, gen);
 }
 
-Tensor& random_mps_(Tensor& self, const std::optional<Generator>& gen) {
+Tensor& random_mps_(Tensor& self, c10::optional<Generator> gen) {
   return random_mps_(self, 0, c10::nullopt, gen);
 }
 
 // Exponential distribution
-Tensor& exponential_mps_(Tensor& self, double lambda, const std::optional<Generator>& gen) {
+Tensor& exponential_mps_(Tensor& self, double lambda, c10::optional<Generator> gen) {
   TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda);
 
   mps::RandomOpBlock random_op_block = ^RandomOpFn(cachedGraph, randomTensor) {
@@ -405,7 +405,7 @@
                                       random_op_block);
 }
 
-Tensor& randperm_out_mps(int64_t n, const std::optional<Generator>& generator, Tensor& result) {
+Tensor& randperm_out_mps(int64_t n, c10::optional<Generator> generator, Tensor& result) {
   if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("MPS: randperm op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performance implications.");
@@ -453,7 +453,7 @@
 
 static Tensor& multinomial_with_replacement_mps_kernel(const Tensor& self,
                                                        const int64_t n_sample,
-                                                       const std::optional<Generator>& generator,
+                                                       c10::optional<Generator> generator,
                                                        Tensor& result) {
   using namespace mps;
 
@@ -581,7 +581,7 @@
 Tensor& multinomial_out_mps(const Tensor& self,
                             int64_t n_sample,
                             bool with_replacement,
-                            const std::optional<Generator>& gen,
+                            c10::optional<Generator> gen,
                             Tensor& result) {
   TORCH_CHECK(result.device() == self.device(), "multinomial arguments must have the same device");
   TORCH_CHECK(self.dim() > 0 && self.dim() <= 2, "prob_dist must be 1 or 2 dim");
@@ -652,10 +652,7 @@
   return result;
 }
 
-Tensor multinomial_mps(const Tensor& self,
-                       int64_t n_sample,
-                       bool with_replacement,
-                       const std::optional<Generator>& gen) {
+Tensor multinomial_mps(const Tensor& self, int64_t n_sample, bool with_replacement, c10::optional<Generator> gen) {
   Tensor result = at::empty({0}, self.options().dtype(kLong));
   multinomial_out_mps(self, n_sample, with_replacement, gen, result);
   return result;
diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp
index 7e1666d..20c8489 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp
@@ -931,7 +931,7 @@
   return self.reshape(sizes);
 }
 
-Tensor& normal_nested_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+Tensor& normal_nested_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
   const auto& self_buf = get_nested_tensor_impl(self)->get_buffer();
   self_buf.normal_(mean, std, gen);
   return self;
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
index 4ec69c7..bff9842 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
@@ -326,7 +326,7 @@
     Tensor& self,
     double mean,
     double std,
-    const std::optional<Generator>& gen) {
+    c10::optional<Generator> gen) {
   return unary_op_inplace(self, &Tensor::normal_, mean, std, gen);
 }
 
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp
index 0d4e599..8f6f7a9 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp
@@ -354,7 +354,7 @@
         int window_size_left,
         int window_size_right,
         const bool return_softmax,
-        const std::optional<at::Generator>& gen_) {
+        c10::optional<at::Generator> gen_) {
 
     auto dprops = at::cuda::getCurrentDeviceProperties();
     // bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
@@ -546,7 +546,7 @@
                int window_size_left,
                int window_size_right,
                const bool return_softmax,
-               const std::optional<at::Generator>& gen_) {
+               c10::optional<at::Generator> gen_) {
 
     auto dprops = at::cuda::getCurrentDeviceProperties();
     // bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h
index 336d1b4..2745b28 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.h
@@ -19,7 +19,7 @@
         int window_size_left,
         int window_size_right,
         const bool return_softmax,
-        const std::optional<at::Generator>& gen_);
+        c10::optional<at::Generator> gen_);
 
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
 mha_varlen_fwd(const at::Tensor &q,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
@@ -39,7 +39,7 @@
                int window_size_left,
                int window_size_right,
                const bool return_softmax,
-               const std::optional<at::Generator>& gen_);
+               c10::optional<at::Generator> gen_);
 
 
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.hip b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.hip
index 76c8b1d..24eebee 100644
--- a/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/flash_api.hip
@@ -113,7 +113,7 @@
         int window_size_left,
         int window_size_right,
         const bool return_softmax,
-        const std::optional<at::Generator>& gen_) {
+        c10::optional<at::Generator> gen_) {
   check_gpu_arch();
 
   auto q_dtype = q.dtype();
@@ -322,7 +322,7 @@
                int window_size_left,
                int window_size_right,
                const bool return_softmax,
-               const std::optional<at::Generator>& gen_) {
+               c10::optional<at::Generator> gen_) {
 
   TORCH_CHECK(false, "mha_varlen_fwd not supported on ROCm");
 
diff --git a/aten/src/ATen/native/vulkan/ops/Random.cpp b/aten/src/ATen/native/vulkan/ops/Random.cpp
index 23d9549..c266b10 100644
--- a/aten/src/ATen/native/vulkan/ops/Random.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Random.cpp
@@ -16,7 +16,7 @@
     Tensor& self,
     const double from,
     const double to,
-    const std::optional<at::Generator>& /* not implemented */) {
+    const c10::optional<at::Generator> /* not implemented */) {
   TORCH_CHECK(
       self.is_vulkan(),
       "Vulkan: In-place operator is only supported on Vulkan tensors.");
@@ -75,7 +75,7 @@
     Tensor& self,
     const double mean,
     const double std,
-    const std::optional<at::Generator>& /* not implemented */) {
+    const c10::optional<at::Generator> /* not implemented */) {
   TORCH_CHECK(
       self.is_vulkan(),
       "Vulkan: In-place operator is only supported on Vulkan tensors.");
diff --git a/aten/src/ATen/test/cpu_rng_test.cpp b/aten/src/ATen/test/cpu_rng_test.cpp
index d860e5c..ebc3eee 100644
--- a/aten/src/ATen/test/cpu_rng_test.cpp
+++ b/aten/src/ATen/test/cpu_rng_test.cpp
@@ -44,89 +44,89 @@
 
 // ==================================================== Random ========================================================
 
-Tensor& random_(Tensor& self, const std::optional<Generator>& generator) {
+Tensor& random_(Tensor& self, c10::optional<Generator> generator) {
   return at::native::templates::random_impl<native::templates::cpu::RandomKernel, TestCPUGenerator>(self, generator);
 }
 
-Tensor& random_from_to(Tensor& self, int64_t from, optional<int64_t> to, const std::optional<Generator>& generator) {
+Tensor& random_from_to(Tensor& self, int64_t from, optional<int64_t> to, c10::optional<Generator> generator) {
   return at::native::templates::random_from_to_impl<native::templates::cpu::RandomFromToKernel, TestCPUGenerator>(self, from, to, generator);
 }
 
-Tensor& random_to(Tensor& self, int64_t to, const std::optional<Generator>& generator) {
+Tensor& random_to(Tensor& self, int64_t to, c10::optional<Generator> generator) {
   return random_from_to(self, 0, to, generator);
 }
 
 // ==================================================== Normal ========================================================
 
-Tensor& normal_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+Tensor& normal_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
   return at::native::templates::normal_impl_<native::templates::cpu::NormalKernel, TestCPUGenerator>(self, mean, std, gen);
 }
 
-Tensor& normal_Tensor_float_out(const Tensor& mean, double std, const std::optional<Generator>& gen, Tensor& output) {
+Tensor& normal_Tensor_float_out(const Tensor& mean, double std, c10::optional<Generator> gen, Tensor& output) {
   return at::native::templates::normal_out_impl<native::templates::cpu::NormalKernel, TestCPUGenerator>(output, mean, std, gen);
 }
 
-Tensor& normal_float_Tensor_out(double mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& output) {
+Tensor& normal_float_Tensor_out(double mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
   return at::native::templates::normal_out_impl<native::templates::cpu::NormalKernel, TestCPUGenerator>(output, mean, std, gen);
 }
 
-Tensor& normal_Tensor_Tensor_out(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen, Tensor& output) {
+Tensor& normal_Tensor_Tensor_out(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen, Tensor& output) {
   return at::native::templates::normal_out_impl<native::templates::cpu::NormalKernel, TestCPUGenerator>(output, mean, std, gen);
 }
 
-Tensor normal_Tensor_float(const Tensor& mean, double std, const std::optional<Generator>& gen) {
+Tensor normal_Tensor_float(const Tensor& mean, double std, c10::optional<Generator> gen) {
   return at::native::templates::normal_impl<native::templates::cpu::NormalKernel, TestCPUGenerator>(mean, std, gen);
 }
 
-Tensor normal_float_Tensor(double mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor normal_float_Tensor(double mean, const Tensor& std, c10::optional<Generator> gen) {
   return at::native::templates::normal_impl<native::templates::cpu::NormalKernel, TestCPUGenerator>(mean, std, gen);
 }
 
-Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, const std::optional<Generator>& gen) {
+Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
   return at::native::templates::normal_impl<native::templates::cpu::NormalKernel, TestCPUGenerator>(mean, std, gen);
 }
 
 // ==================================================== Uniform =======================================================
 
-Tensor& uniform_(Tensor& self, double from, double to, const std::optional<Generator>& generator) {
+Tensor& uniform_(Tensor& self, double from, double to, c10::optional<Generator> generator) {
   return at::native::templates::uniform_impl_<native::templates::cpu::UniformKernel, TestCPUGenerator>(self, from, to, generator);
 }
 
 // ==================================================== Cauchy ========================================================
 
-Tensor& cauchy_(Tensor& self, double median, double sigma, const std::optional<Generator>& generator) {
+Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional<Generator> generator) {
   return at::native::templates::cauchy_impl_<native::templates::cpu::CauchyKernel, TestCPUGenerator>(self, median, sigma, generator);
 }
 
 // ================================================== LogNormal =======================================================
 
-Tensor& log_normal_(Tensor& self, double mean, double std, const std::optional<Generator>& gen) {
+Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
   return at::native::templates::log_normal_impl_<native::templates::cpu::LogNormalKernel, TestCPUGenerator>(self, mean, std, gen);
 }
 
 // ================================================== Geometric =======================================================
 
-Tensor& geometric_(Tensor& self, double p, const std::optional<Generator>& gen) {
+Tensor& geometric_(Tensor& self, double p, c10::optional<Generator> gen) {
   return at::native::templates::geometric_impl_<native::templates::cpu::GeometricKernel, TestCPUGenerator>(self, p, gen);
 }
 
 // ================================================== Exponential =====================================================
 
-Tensor& exponential_(Tensor& self, double lambda, const std::optional<Generator>& gen) {
+Tensor& exponential_(Tensor& self, double lambda, c10::optional<Generator> gen) {
   return at::native::templates::exponential_impl_<native::templates::cpu::ExponentialKernel, TestCPUGenerator>(self, lambda, gen);
 }
 
 // ================================================== Bernoulli =======================================================
 
-Tensor& bernoulli_Tensor(Tensor& self, const Tensor& p_, const std::optional<Generator>& gen) {
+Tensor& bernoulli_Tensor(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
   return at::native::templates::bernoulli_impl_<native::templates::cpu::BernoulliKernel, TestCPUGenerator>(self, p_, gen);
 }
 
-Tensor& bernoulli_float(Tensor& self, double p, const std::optional<Generator>& gen) {
+Tensor& bernoulli_float(Tensor& self, double p, c10::optional<Generator> gen) {
   return at::native::templates::bernoulli_impl_<native::templates::cpu::BernoulliKernel, TestCPUGenerator>(self, p, gen);
 }
 
-Tensor& bernoulli_out(const Tensor& self, const std::optional<Generator>& gen, Tensor& result) {
+Tensor& bernoulli_out(const Tensor& self, c10::optional<Generator> gen, Tensor& result) {
   return at::native::templates::bernoulli_out_impl<native::templates::cpu::BernoulliKernel, TestCPUGenerator>(result, self, gen);
 }
 
diff --git a/aten/src/ATen/test/cuda_distributions_test.cu b/aten/src/ATen/test/cuda_distributions_test.cu
index aab31af..6f6cfca 100644
--- a/aten/src/ATen/test/cuda_distributions_test.cu
+++ b/aten/src/ATen/test/cuda_distributions_test.cu
@@ -174,7 +174,7 @@
   bool shuffled2 = false;
   for (int i = 0; i < 100; i++) {
     cudaDeviceSynchronize();
-    std::optional<at::Generator> gen = c10::nullopt;
+    c10::optional<at::Generator> gen = c10::nullopt;
     randperm_handle_duplicate_keys(keys, values, 8, 5, gen);
     cudaDeviceSynchronize();
     std::vector<int> slice1 = {values[0], values[1], values[2]};
diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp
index a4da80e..2e657d1 100644
--- a/test/cpp_extensions/rng_extension.cpp
+++ b/test/cpp_extensions/rng_extension.cpp
@@ -33,15 +33,15 @@
   uint64_t value_;
 };
 
-Tensor& random_(Tensor& self, const std::optional<Generator>& generator) {
+Tensor& random_(Tensor& self, c10::optional<Generator> generator) {
   return at::native::templates::random_impl<native::templates::cpu::RandomKernel, TestCPUGenerator>(self, generator);
 }
 
-Tensor& random_from_to(Tensor& self, int64_t from, optional<int64_t> to, const std::optional<Generator>& generator) {
+Tensor& random_from_to(Tensor& self, int64_t from, optional<int64_t> to, c10::optional<Generator> generator) {
   return at::native::templates::random_from_to_impl<native::templates::cpu::RandomFromToKernel, TestCPUGenerator>(self, from, to, generator);
 }
 
-Tensor& random_to(Tensor& self, int64_t to, const std::optional<Generator>& generator) {
+Tensor& random_to(Tensor& self, int64_t to, c10::optional<Generator> generator) {
   return random_from_to(self, 0, to, generator);
 }
 
diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp
index e82b2ca..823b27f 100644
--- a/torch/csrc/jit/frontend/tracer.cpp
+++ b/torch/csrc/jit/frontend/tracer.cpp
@@ -678,7 +678,7 @@
 void addInputs(
     Node* n,
     const char* name,
-    const std::optional<at::Generator>& value) {
+    const c10::optional<at::Generator>& value) {
   Graph* g = n->owningGraph();
 
   if (value.has_value() && value->defined()) {
diff --git a/torch/csrc/jit/frontend/tracer.h b/torch/csrc/jit/frontend/tracer.h
index 8081fac..f265d57 100644
--- a/torch/csrc/jit/frontend/tracer.h
+++ b/torch/csrc/jit/frontend/tracer.h
@@ -340,7 +340,7 @@
 TORCH_API void addInputs(
     Node* n,
     const char* name,
-    const std::optional<at::Generator>& value);
+    const c10::optional<at::Generator>& value);
 
 inline void addInputs(
     Node* n,
diff --git a/torch/csrc/jit/runtime/register_special_ops.cpp b/torch/csrc/jit/runtime/register_special_ops.cpp
index 944c696..36ede67 100644
--- a/torch/csrc/jit/runtime/register_special_ops.cpp
+++ b/torch/csrc/jit/runtime/register_special_ops.cpp
@@ -406,7 +406,7 @@
           double a;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double b;
-          std::optional<at::Generator> generator =
+          c10::optional<at::Generator> generator =
               pop(stack).toOptional<at::Generator>();
 
           pop(stack, tensor, a, b);
@@ -425,7 +425,7 @@
           double mean;
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           double std;
-          std::optional<at::Generator> generator =
+          c10::optional<at::Generator> generator =
               pop(stack).toOptional<at::Generator>();
 
           pop(stack, tensor, mean, std);
diff --git a/torch/csrc/lazy/core/shape_inference.cpp b/torch/csrc/lazy/core/shape_inference.cpp
index 6c0940a..c1b3424 100644
--- a/torch/csrc/lazy/core/shape_inference.cpp
+++ b/torch/csrc/lazy/core/shape_inference.cpp
@@ -177,14 +177,14 @@
 
 std::vector<Shape> compute_shape_bernoulli(
     const at::Tensor& self,
-    const std::optional<at::Generator>& generator) {
+    c10::optional<at::Generator> generator) {
   return {Shape(self.scalar_type(), self.sizes().vec())};
 }
 
 std::vector<Shape> compute_shape_bernoulli(
     const at::Tensor& self,
     double p,
-    const std::optional<at::Generator>& generator) {
+    c10::optional<at::Generator> generator) {
   return compute_shape_bernoulli(self, generator);
 }
 
@@ -692,14 +692,14 @@
 
 std::vector<Shape> compute_shape_random(
     const at::Tensor& self,
-    const std::optional<at::Generator>& generator) {
+    c10::optional<at::Generator> generator) {
   return {Shape(self.scalar_type(), self.sizes().vec())};
 }
 
 std::vector<Shape> compute_shape_random(
     const at::Tensor& self,
     int64_t to,
-    const std::optional<at::Generator>& generator) {
+    c10::optional<at::Generator> generator) {
   return compute_shape_random(self, generator);
 }
 
@@ -707,7 +707,7 @@
     const at::Tensor& self,
     int64_t from,
     c10::optional<int64_t> to,
-    const std::optional<at::Generator>& generator) {
+    c10::optional<at::Generator> generator) {
   return compute_shape_random(self, generator);
 }
 
@@ -1372,7 +1372,7 @@
     const at::Tensor& self,
     double mean,
     double std,
-    const std::optional<at::Generator>& generator) {
+    c10::optional<at::Generator> generator) {
   return {Shape(self.scalar_type(), self.sizes().vec())};
 }
 
@@ -1380,7 +1380,7 @@
     const at::Tensor& self,
     double from,
     double to,
-    const std::optional<at::Generator>& generator) {
+    c10::optional<at::Generator> generator) {
   return {Shape(self.scalar_type(), self.sizes().vec())};
 }
 
diff --git a/torch/csrc/lazy/core/shape_inference.h b/torch/csrc/lazy/core/shape_inference.h
index e37a4eb..a8388a0 100644
--- a/torch/csrc/lazy/core/shape_inference.h
+++ b/torch/csrc/lazy/core/shape_inference.h
@@ -24,8 +24,8 @@
 TORCH_API std::vector<torch::lazy::Shape> compute_shape__adaptive_avg_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_abs(const at::Tensor & self);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_arange_out(const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, at::Tensor & out);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_bernoulli(const at::Tensor & self, const ::std::optional<at::Generator> & generator);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_bernoulli(const at::Tensor & self, double p, const ::std::optional<at::Generator> & generator);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_bernoulli(const at::Tensor & self, c10::optional<at::Generator> generator);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_bernoulli(const at::Tensor & self, double p, c10::optional<at::Generator> generator);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_binary_cross_entropy(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_binary_cross_entropy_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_cat(at::TensorList tensors, int64_t dim);
@@ -70,10 +70,10 @@
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_nll_loss2d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_nll_loss2d_forward(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_nonzero(const at::Tensor & self);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_normal_functional(const at::Tensor & self, double mean, double std, const ::std::optional<at::Generator> & generator);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_random(const at::Tensor & self, const ::std::optional<at::Generator> & generator);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_random(const at::Tensor & self, int64_t to, const ::std::optional<at::Generator> & generator);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_random(const at::Tensor & self, int64_t from, c10::optional<int64_t> to, const ::std::optional<at::Generator> & generator);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_normal_functional(const at::Tensor & self, double mean, double std, c10::optional<at::Generator> generator);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_random(const at::Tensor & self, c10::optional<at::Generator> generator);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_random(const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_random(const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_relu(const at::Tensor & self);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_repeat(const at::Tensor & self, at::IntArrayRef repeats);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_slogdet(const at::Tensor & self);
@@ -92,7 +92,7 @@
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_hardswish(const at::Tensor & self);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_hardswish_backward(const at::Tensor & grad_output, const at::Tensor & self);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_selu(const at::Tensor & self);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_uniform(const at::Tensor & self, double from, double to, const ::std::optional<at::Generator> & generator);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_uniform(const at::Tensor & self, double from, double to, c10::optional<at::Generator> generator);
 
 // Non-Native ops
 TORCH_API std::vector<Shape> compute_shape_scalar(const at::Scalar& value, const at::ScalarType& type);
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index aecd784..cec99a8 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -246,7 +246,7 @@
   inline std::vector<int64_t> intlistWithDefault(
       int i,
       std::vector<int64_t> default_intlist);
-  inline std::optional<at::Generator> generator(int i);
+  inline c10::optional<at::Generator> generator(int i);
   inline at::Storage storage(int i);
   inline at::Storage storage(
       int i,
@@ -1069,7 +1069,7 @@
   return args[i] == nullptr;
 }
 
-inline std::optional<at::Generator> PythonArgs::generator(int i) {
+inline c10::optional<at::Generator> PythonArgs::generator(int i) {
   if (!args[i])
     return c10::nullopt;
   return reinterpret_cast<THPGenerator*>(args[i])->cdata;
diff --git a/torchgen/api/cpp.py b/torchgen/api/cpp.py
index e08b8bc..f546603 100644
--- a/torchgen/api/cpp.py
+++ b/torchgen/api/cpp.py
@@ -144,9 +144,6 @@
         remove_non_owning_ref_types=remove_non_owning_ref_types,
     )
     if r is not None:
-        if isinstance(t, OptionalType) and not mutable:
-            if str(t.elem) == "Generator":
-                return NamedCType(binds, ConstRefCType(r.type))
         return r
 
     if isinstance(t, BaseType):
diff --git a/torchgen/api/types/types.py b/torchgen/api/types/types.py
index 1a8ad3c..16eff73 100644
--- a/torchgen/api/types/types.py
+++ b/torchgen/api/types/types.py
@@ -12,7 +12,6 @@
 Add new types to `types.py` if these types are ATen/c10 related.
 Add new types to `types_base.py` if they are basic and not attached to ATen/c10.
 """
-
 from dataclasses import dataclass
 from typing import Dict
 
@@ -32,6 +31,7 @@
     shortT,
 )
 
+
 TENSOR_LIST_LIKE_CTYPES = [
     "at::TensorList",
     "const c10::List<c10::optional<at::Tensor>> &",
@@ -133,13 +133,9 @@
 
     def cpp_type(self, *, strip_ref: bool = False) -> str:
         # Do not pass `strip_ref` recursively.
-        if "Generator" in self.elem.cpp_type():
-            return f"::std::optional<{self.elem.cpp_type()}>"
         return f"c10::optional<{self.elem.cpp_type()}>"
 
     def cpp_type_registration_declarations(self) -> str:
-        if "Generator" in self.elem.cpp_type_registration_declarations():
-            return f"::std::optional<{self.elem.cpp_type_registration_declarations()}>"
         return f"c10::optional<{self.elem.cpp_type_registration_declarations()}>"
 
     def remove_const_ref(self) -> "CType":
diff --git a/torchgen/api/types/types_base.py b/torchgen/api/types/types_base.py
index 127b26d..2f8561e 100644
--- a/torchgen/api/types/types_base.py
+++ b/torchgen/api/types/types_base.py
@@ -95,13 +95,11 @@
     elem: "CType"
 
     def cpp_type(self, *, strip_ref: bool = False) -> str:
-        if isinstance(self.elem, ConstRefCType) or strip_ref:
+        if strip_ref:
             return self.elem.cpp_type(strip_ref=strip_ref)
         return f"const {self.elem.cpp_type()} &"
 
     def cpp_type_registration_declarations(self) -> str:
-        if isinstance(self.elem, ConstRefCType):
-            return self.elem.cpp_type_registration_declarations()
         return f"const {self.elem.cpp_type_registration_declarations()} &"
 
     def remove_const_ref(self) -> "CType":
diff --git a/torchgen/dest/lazy_ir.py b/torchgen/dest/lazy_ir.py
index 8ec3f7a..43cde1e 100644
--- a/torchgen/dest/lazy_ir.py
+++ b/torchgen/dest/lazy_ir.py
@@ -16,7 +16,6 @@
 from torchgen.api.types import (
     BaseCType,
     Binding,
-    ConstRefCType,
     deviceT,
     DispatcherSignature,
     kernel_signature,
@@ -246,9 +245,7 @@
         value_args = schema.filtered_args(values=True, scalars=False)
         scalar_args = schema.filtered_args(values=False, scalars=True)
 
-        ctor_args = [
-            f"{ConstRefCType(i.lazy_type).cpp_type()} {i.name}" for i in all_args
-        ]
+        ctor_args = [f"const {i.lazy_type.cpp_type()}& {i.name}" for i in all_args]
         reuse_ctor_args = ", ".join(ctor_args)
         if self.use_lazy_shape and schema.properties.ShapePrecompute:
             ctor_args.append("std::vector<torch::lazy::Shape>&& shapes")