use irange for caffe2/aten directory (#72067)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72067

The majority of the script used to generate the changes comes from Richard Barnes (D28874212).

Use irange in PyTorch, which adds several benefits:
- const safety
- may help the compiler generate a more efficient binary
- more concise
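
A minimal before/after sketch of the rewrite (the function and variable names here are only illustrative):

```
#include <c10/util/irange.h>
#include <cstdint>

void scale_in_place(float* data, int64_t n, float scale) {
  // Before: mutable index whose type is spelled out by hand.
  for (int64_t i = 0; i < n; ++i) {
    data[i] *= scale;
  }
  // After: the index is const, its type is deduced from the bound, and
  // the intent (iterate over [0, n)) is stated directly.
  for (const auto i : c10::irange(n)) {
    data[i] *= scale;
  }
}
```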

Originally, I planned to change everything, including the header files, but that caused too many errors in other places, so I changed the script to only touch the .cpp and .cc files.

```
#filetypes = ('.cpp', '.cc', '.h', '.hpp')
filetypes = ('.cpp', '.cc')
```

Even when changing only the .cpp/.cc files there are still some unknown issues, so I limited the script to the **aten** folder to begin with.
```
#target_path = '..'
target_path = '../aten'
```
**Later on, we could run the script on each of the other folders, one by one.**

The following files are known to cause issues (such as namespace conflicts for code that is already in the c10 namespace, or loops whose index variable must not be constant). We will need to deal with them one by one; a sketch of the constant-loop-variable case follows the list.
```
excluded_files = ['../c10/util/ConstexprCrc.h',
    '../aten/src/ATen/core/jit_type.h',
    '../aten/src/ATen/native/Math.h',
    '../c10/util/variant.h',
    '../c10/util/flags_use_no_gflags.cpp',
    '../caffe2/operators/cc_bmm_bg_op.h',
    '../aten/src/ATen/core/tensor_type.cpp',
    '../aten/src/ATen/native/Linear.cpp',
    '../aten/src/ATen/native/ConvolutionTBC.cpp',
    '../caffe2/share/fb/mask_rcnn/bbox_concat_batch_splits_op.h',
    '../aten/src/ATen/native/BatchLinearAlgebra.cpp',
    '../aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp',
    '../aten/src/ATen/native/cuda/DistributionTemplates.h',
    '../c10/util/sparse_bitset.h',
    '../torch/csrc/distributed/c10d/TCPStore.cpp',
    '../caffe2/fb/operators/calibration_op.h',
    '../torch/csrc/jit/testing/file_check.cpp',
    '../torch/csrc/jit/passes/concat_opt.cpp',
    '../torch/csrc/jit/tensorexpr/operators/reduction.cpp',
    '../torch/fb/operators/select_keys.cpp',
    '../torch/fb/operators/calibration/bucketize_calibration.cpp',
    '../fb/custom_ops/maskrcnn/int8/int8_aabb_roi_align.cpp',
    '../fb/custom_ops/maskrcnn/aabb/aabb_roi_align.cpp',
    '../caffe2/fb/tests/RecordIOHelper.cpp',
    '../test/cpp/api/rnn.cpp',
    '../torch/fb/training_toolkit/common/tdigest/tests/TestBufferedTDigest.cpp'
    ]
```
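
For illustration, here is a hypothetical sketch (not taken from the excluded files) of the constant-loop-variable case: the body advances the index itself, which the `const` index produced by the rewrite would forbid.

```
#include <cstdint>

// Hypothetical loop the script cannot convert: the body increments the
// index to skip an escaped character, which a const index forbids.
void count_unescaped(const char* buf, int64_t n, int64_t* count) {
  for (int64_t i = 0; i < n; ++i) {
    if (buf[i] == '\\') {
      ++i;  // consume the escaped character; illegal with `const auto i`
    } else {
      ++(*count);
    }
  }
}
```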

I placed **use_irange.py** in caffe2/scripts and ran the script from there.
```
[charleszhang@devvm7388]~/fbsource/fbcode/caffe2/scripts% pwd
/home/charleszhang/fbsource/fbcode/caffe2/scripts
[charleszhang@devvm7388]~/fbsource/fbcode/caffe2/scripts% ls -l use*
-rwxr-xr-x 1 charleszhang users 5174 Jan 27 10:18 use_irange.py
```

The following is the **use_irange.py** script I used to generate the changes.
```
#!/usr/bin/env python3
# (c) Facebook, Inc. and its affiliates. Confidential and proprietary.

import re
import os

irange_header = "#include <c10/util/irange.h>"

# I recommend using https://regex101.com/ to understand this.
for_loop_regex = re.compile(
    r"for\s*\((?:int32_t|int64_t|uint32_t|int64_t|size_t|int|unsigned|auto|std::size_t|short|uint16_t|uint8_t) ([A-Za-z0-9_]+)\s*=\s*([^\s]+)\s*;\s*\1\s*<\s*([^\s]+)\s*;\s*(?:\+\+\1|\1\+\+)\s*\)\s*({?)")

header_regex = re.compile(r'#include ["<][^>"]+(?:[">])')

new_loop_zero = "for (const auto {loop_var} : c10::irange({upper_bound})){bracket}"
new_loop_range = (
    "for (const auto {loop_var} : c10::irange({lower_bound}, {upper_bound})){bracket}"
)

#header_insertion_points = (("c10", "alpha"), ("ATen/", "after"), ("torch/", "before"))

def find_c10(data : str) -> int:
    insert_at = -1
    for m in header_regex.finditer(data):
        if "c10/" in m.group(0):
            if insert_at is None:
                insert_at = m.span()[0]
            if irange_header > m.group(0):
                insert_at = m.span()[1]
    return insert_at

def find_ATen(data : str) -> int:
    insert_at = -1
    for m in header_regex.finditer(data):
        if "ATen/" in m.group(0):
            insert_at = m.span()[1]
    return insert_at

def find_torch(data : str) -> int:
    for m in header_regex.finditer(data):
        if "torch/" in m.group(0):
            return m.span()[0]
    return -1

def find_header_insertion_point(data: str) -> (int, str):
    """Look through headers to find an insertion point."""

    m = find_c10(data)
    if m != -1:
        return m, "after"
    else:
        m = find_ATen(data)
        if m != -1:
            return m, "after"
        else:
            m = find_torch(data)
            return m, "before"

def process_one_file(a_file : str):
    data = ''
    with open(a_file) as f:
        data = f.read()
    has_for_loop = for_loop_regex.findall(data)
    if not has_for_loop:
        return
    needs_header = has_for_loop and irange_header not in data

    if needs_header:
        pos, stype = find_header_insertion_point(data)
        # We do not change the file if we do not know where to insert the
        # header, for now, since there are too many such files.
        if pos == -1:
            return
        if stype == "after":
            data = data[0:pos] + "\n" + irange_header + data[pos:]
        else:
            data = data[0:pos] + irange_header + "\n" + data[pos:]

    start = 0
    new_data = ""
    for match in for_loop_regex.finditer(data):
        loop_text_begin, loop_text_end = match.span()
        loop_var = match.group(1)
        lower_bound = match.group(2)
        upper_bound = match.group(3)
        bracket = " {" if match.group(4) == "{" else ""
        if lower_bound == "0":
            replacement_loop = new_loop_zero.format(
                loop_var=loop_var, upper_bound=upper_bound, bracket=bracket
            )
        else:
            replacement_loop = new_loop_range.format(
                loop_var=loop_var,
                lower_bound=lower_bound,
                upper_bound=upper_bound,
                bracket=bracket,
            )
        old_loop = data[loop_text_begin : loop_text_end]
        new_data += data[start : loop_text_begin] + replacement_loop
        start = loop_text_end
    new_data += data[start:]

    with open(a_file, "w") as fout:
        fout.write(new_data)

#filetypes = ('.cpp', '.cc', '.h', '.hpp')
filetypes = ('.cpp', '.cc')
#target_path = '..'
target_path = '../aten'

excluded_files = ['../c10/util/ConstexprCrc.h',
    '../aten/src/ATen/core/jit_type.h',
    '../aten/src/ATen/native/Math.h',
    '../c10/util/variant.h',
    '../c10/util/flags_use_no_gflags.cpp',
    '../caffe2/operators/cc_bmm_bg_op.h',
    '../aten/src/ATen/core/tensor_type.cpp',
    '../aten/src/ATen/native/Linear.cpp',
    '../aten/src/ATen/native/ConvolutionTBC.cpp',
    '../caffe2/share/fb/mask_rcnn/bbox_concat_batch_splits_op.h',
    '../aten/src/ATen/native/BatchLinearAlgebra.cpp',
    '../aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp',
    '../aten/src/ATen/native/cuda/DistributionTemplates.h',
    '../c10/util/sparse_bitset.h',
    '../torch/csrc/distributed/c10d/TCPStore.cpp',
    '../caffe2/fb/operators/calibration_op.h',
    '../torch/csrc/jit/testing/file_check.cpp',
    '../torch/csrc/jit/passes/concat_opt.cpp',
    '../torch/csrc/jit/tensorexpr/operators/reduction.cpp',
    '../torch/fb/operators/select_keys.cpp',
    '../torch/fb/operators/calibration/bucketize_calibration.cpp',
    '../fb/custom_ops/maskrcnn/int8/int8_aabb_roi_align.cpp',
    '../fb/custom_ops/maskrcnn/aabb/aabb_roi_align.cpp',
    '../caffe2/fb/tests/RecordIOHelper.cpp',
    '../test/cpp/api/rnn.cpp',
    '../torch/fb/training_toolkit/common/tdigest/tests/TestBufferedTDigest.cpp'
    ]

for current_folder, subfolders, files in os.walk(target_path):
    for a_file in files:
        if a_file.endswith(filetypes) and current_folder != '../caffe2/torch/jit':
            full_path = os.path.join(current_folder, a_file)
            if full_path not in excluded_files:
                process_one_file(full_path)

```

Test Plan: Sandcastle

Reviewed By: r-barnes

Differential Revision: D33892443

fbshipit-source-id: eb76a3b39e6bebb867ede85f74af9791ee8be566
(cherry picked from commit 28f8a2a6cca5b9a4e4ce4166bdc50135caf1b311)
diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp
index 3e68670..ded8e4f 100644
--- a/aten/src/ATen/FunctionalInverses.cpp
+++ b/aten/src/ATen/FunctionalInverses.cpp
@@ -3,6 +3,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/ExpandUtils.h>
+#include <c10/util/irange.h>
 namespace at {
 namespace functionalization {
 
@@ -155,7 +156,7 @@
     dim = at::maybe_wrap_dim(dim, base.sizes().size());
     auto dim_size = base.size(dim);
     int64_t start = 0;
-    for (auto i = 0; i < mutated_view_idx; ++i) {
+    for (const auto i : c10::irange(mutated_view_idx)) {
         start += split_sizes[i];
     }
     auto end = start + split_sizes[mutated_view_idx];
diff --git a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.cpp b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.cpp
index e35f487..5799fa0 100644
--- a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.cpp
+++ b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.cpp
@@ -1,6 +1,7 @@
 #include <ATen/core/DeprecatedTypePropertiesRegistry.h>
 
 #include <ATen/core/DeprecatedTypeProperties.h>
+#include <c10/util/irange.h>
 
 namespace at {
 
@@ -9,8 +10,8 @@
 }
 
 DeprecatedTypePropertiesRegistry::DeprecatedTypePropertiesRegistry() {
-  for (int b = 0; b < static_cast<int>(Backend::NumOptions); ++b) {
-    for (int s = 0; s < static_cast<int>(ScalarType::NumOptions); ++s) {
+  for (const auto b : c10::irange(static_cast<int>(Backend::NumOptions))) {
+    for (const auto s : c10::irange(static_cast<int>(ScalarType::NumOptions))) {
       registry[b][s] = std::make_unique<DeprecatedTypeProperties>(
               static_cast<Backend>(b),
               static_cast<ScalarType>(s));
diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp
index 68fcd8c..f3122da 100644
--- a/aten/src/ATen/core/Formatting.cpp
+++ b/aten/src/ATen/core/Formatting.cpp
@@ -173,7 +173,7 @@
     for (const auto l : c10::irange(self.size(0))) {
       Tensor row = self.select(0,l);
       double *row_ptr = row.data_ptr<double>();
-      for(int64_t c = firstColumn; c < lastColumn+1; c++) {
+      for (const auto c : c10::irange(firstColumn, lastColumn+1)) {
         stream << std::setw(sz) << row_ptr[c]/scale;
         if(c == lastColumn) {
           stream << std::endl;
@@ -226,7 +226,7 @@
     }
     stream << "(";
     Tensor tensor = self;
-    for(int64_t i=0; i < self.ndimension()-2; i++) {
+    for (const auto i : c10::irange(self.ndimension()-2)) {
       tensor = tensor.select(0, counter[i]);
       stream << counter[i]+1 << ",";
     }
diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp
index 9d7b38d..6884e0c 100644
--- a/aten/src/ATen/core/class_type.cpp
+++ b/aten/src/ATen/core/class_type.cpp
@@ -406,7 +406,7 @@
 ClassTypePtr ClassType::refine(at::ArrayRef<TypePtr> refined_slots) const {
   auto ptr = ClassType::create(name(), compilation_unit_, is_module());
   AT_ASSERT(numAttributes() == refined_slots.size());
-  for (size_t i = 0; i < attributes_.size(); ++i) {
+  for (const auto i : c10::irange(attributes_.size())) {
     AT_ASSERT(refined_slots[i]->isSubtypeOf(*attributes_[i].getType()));
     ptr->addAttribute(attributes_[i].getName(), refined_slots[i], (attributes_[i].getKind() == AttributeKind::PARAMETER),
     (attributes_[i].getKind() == AttributeKind::BUFFER));
@@ -495,7 +495,7 @@
 
 void ClassType::checkNotExist(const std::string& name, const std::string& what) const {
   // Check no overlap with existing constants
-  for (size_t i = 0; i < constantNames_.size(); ++i) {
+  for (const auto i : c10::irange(constantNames_.size())) {
     TORCH_CHECK(
         name != constantNames_[i],
         "attempting to add ",
diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
index c8c3859..a930edc 100644
--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
@@ -1,4 +1,5 @@
 #include <ATen/core/dispatch/DispatchKeyExtractor.h>
+#include <c10/util/irange.h>
 
 #include <sstream>
 
@@ -14,7 +15,7 @@
 
 std::string DispatchKeyExtractor::dumpState() const {
   std::ostringstream oss;
-  for (size_t i=0; i < c10::utils::bitset::NUM_BITS(); ++i) {
+  for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) {
     if (dispatch_arg_indices_reverse_.get(i)) {
       oss << "1";
     } else {
diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
index d4d997f..b99d23e 100644
--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@@ -2,6 +2,7 @@
 #include <ATen/core/op_registration/infer_schema.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/dispatch/ObservedOperators.h>
+#include <c10/util/irange.h>
 
 namespace c10 {
 namespace impl {
@@ -443,7 +444,7 @@
 // updateDispatchTableFull_ would update the dispatch table to be)
 std::string OperatorEntry::dumpComputedTable() const {
   std::ostringstream oss;
-  for (uint8_t i = 0; i < static_cast<uint8_t>(DispatchKey::NumDispatchKeys); i++) {
+  for (const auto i : c10::irange(static_cast<uint8_t>(DispatchKey::NumDispatchKeys))) {
     auto k = static_cast<DispatchKey>(i);
     auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k);
     if (kernel_prov.first.kernel.isValid()) {
diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp
index 95050da..c505b75 100644
--- a/aten/src/ATen/core/dynamic_type.cpp
+++ b/aten/src/ATen/core/dynamic_type.cpp
@@ -7,6 +7,7 @@
 #include <ATen/core/jit_type.h>
 #include <ATen/core/type_factory.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 
 namespace c10 {
 
@@ -64,7 +65,7 @@
     c10::ArrayRef<TypePtr> args)
     : Arguments(args) {
   TORCH_INTERNAL_ASSERT(names.size() == args.size());
-  for (size_t i = 0; i < args.size(); i++) {
+  for (const auto i : c10::irange(args.size())) {
     elems[i].label = std::string{names[i]};
   }
 }
diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp
index a3f0451..103d497 100644
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@@ -59,7 +59,7 @@
       if (has_valid_strides_info &&
           type_verbosity() >= TypeVerbosity::TypeAndStride) {
         out << ", strides=[";
-        for (size_t i = 0; i < *ndim; ++i) {
+        for (const auto i : c10::irange(*ndim)) {
           if (i > 0) {
             out << ", ";
           }
@@ -119,7 +119,7 @@
       out << "NamedTuple";
     }
     out << "(";
-    for(size_t i = 0; i < tup->elements().size(); ++i) {
+    for (const auto i : c10::irange(tup->elements().size())) {
       if(i > 0)
         out << ", ";
       if (tup->schema()) {
@@ -299,7 +299,7 @@
       return c10::nullopt;
     }
     std::vector<TypePtr> elements;
-    for (size_t i = 0; i < tuple1->elements().size(); i++) {
+    for (const auto i : c10::irange(tuple1->elements().size())) {
       if (auto elem = unifyTypes(tuple1->elements().at(i), tuple2->elements().at(i), default_to_union)) {
         elements.push_back(*std::move(elem));
       } else {
@@ -431,7 +431,7 @@
       if (tp_formal->elements().size() != tp_actual->elements().size()) {
         return MatchTypeReturn("Cannot match tuples of mismatched size");
       }
-      for (size_t i = 0; i < tp_formal->elements().size(); ++i) {
+      for (const auto i : c10::irange(tp_formal->elements().size())) {
         auto result = matchTypeVariables(
             tp_formal->elements()[i], tp_actual->elements()[i], type_env);
         if (!result.success()) {
@@ -632,7 +632,7 @@
   std::vector<Argument> arguments;
   arguments.reserve(field_names.size());
   auto min_default_idx = field_names.size() - field_defaults.size();
-  for (size_t i = 0; i < field_names.size(); ++i) {
+  for (const auto i : c10::irange(field_names.size())) {
     if (i < min_default_idx) {
       Argument arg{
           /*name=*/std::string{field_names[i]},
@@ -740,7 +740,7 @@
       return false;
     }
 
-    for (size_t i = 0; i < args_lhs.size(); ++i) {
+    for (const auto i : c10::irange(args_lhs.size())) {
       if (args_lhs[i].name() != args_rhs[i].name()) {
         return false;
       }
@@ -788,7 +788,7 @@
     ss << name()->qualifiedName();
   } else {
     ss << "(";
-    for(size_t i = 0; i < elements().size(); ++i) {
+    for (const auto i : c10::irange(elements().size())) {
       if(i > 0)
         ss << ", ";
       ss << elements()[i]->str();
@@ -809,7 +809,7 @@
       // https://docs.python.org/3/library/typing.html#typing.Tuple
       ss << "()";
     } else {
-      for (size_t i = 0; i < elements().size(); ++i) {
+      for (const auto i : c10::irange(elements().size())) {
         if (i > 0)
           ss << ", ";
         ss << elements()[i]->annotation_str(printer);
diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp
index 032e09f..1962e28 100644
--- a/aten/src/ATen/core/union_type.cpp
+++ b/aten/src/ATen/core/union_type.cpp
@@ -378,7 +378,7 @@
 
   ss << "Union" + open_delimeter;
   bool printed = false;
-  for (size_t i = 0; i < types_.size(); ++i) {
+  for (const auto i : c10::irange(types_.size())) {
     if (!can_hold_numbertype || !is_numbertype(types_[i])) {
       if (i > 0) {
         ss << ", ";
diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.cpp b/aten/src/ATen/cuda/PeerToPeerAccess.cpp
index e2c538f..8d2e167 100644
--- a/aten/src/ATen/cuda/PeerToPeerAccess.cpp
+++ b/aten/src/ATen/cuda/PeerToPeerAccess.cpp
@@ -1,6 +1,7 @@
 #include <ATen/cuda/PeerToPeerAccess.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 
 #include <vector>
 #include <algorithm>
@@ -23,7 +24,7 @@
   p2pAccessEnabled_.resize(num_devices * num_devices, -1);
   num_devices_ = num_devices;
 
-  for (int64_t i = 0; i < num_devices; ++i) {
+  for (const auto i : c10::irange(num_devices)) {
     p2pAccessEnabled_[i * num_devices + i] = 1;
   }
 }
diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
index d60d078..5dbe52e 100644
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -15,6 +15,7 @@
 #include <ATen/native/cuda/CuFFTPlanCache.h>
 #include <c10/util/Exception.h>
 #include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/util/irange.h>
 
 #if AT_CUDNN_ENABLED()
 #include <ATen/cudnn/cudnn-wrapper.h>
@@ -208,7 +209,7 @@
       return current_device_index;
     }
   }
-  for (int64_t device_index = 0; device_index < at::cuda::device_count(); device_index++) {
+  for (const auto device_index : c10::irange(at::cuda::device_count())) {
     if (device_index == current_device_index) continue;
     if (hasPrimaryContext(device_index)) {
       return device_index;
diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
index 7722595..3ea6c77 100644
--- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
@@ -256,7 +256,7 @@
   Tensor work = at::empty({lwork}, input.dtype());
   auto work_data = work.data_ptr<scalar_t>();
 
-  for (auto i = decltype(batch_size){0}; i < batch_size; i++) {
+  for (const auto i : c10::irange(decltype(batch_size){0}, batch_size)) {
     scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
     scalar_t* values_working_ptr = &values_data[i * values_stride];
     scalar_t* rvectors_working_ptr = compute_eigenvectors ? &rvectors_data[i * input_matrix_stride] : nullptr;
diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp
index 5765b0e..0ce4336 100644
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@@ -4,6 +4,7 @@
 
 #include <c10/util/SmallBuffer.h>
 #include <c10/util/C++17.h>
+#include <c10/util/irange.h>
 
 #include <climits>
 
@@ -331,7 +332,7 @@
     const scalar_t **b, int64_t ldb,
     scalar_t beta,
     scalar_t **c, int64_t ldc) {
-  for (int64_t batch = 0; batch < batch_size; ++batch) {
+  for (const auto batch : c10::irange(batch_size)) {
     gemm(transa, transb, m, n, k, alpha, a[batch], lda, b[batch], ldb, beta, c[batch], ldc);
   }
 }
@@ -376,7 +377,7 @@
     const scalar_t *b, int64_t ldb, int64_t batch_stride_b,
     scalar_t beta,
     scalar_t *c, int64_t ldc, int64_t batch_stride_c) {
-  for (int64_t batch = 0; batch < batch_size; ++batch) {
+  for (const auto batch : c10::irange(batch_size)) {
     const auto a_batch = a + batch_stride_a * batch;
     const auto b_batch = b + batch_stride_b * batch;
     const auto c_batch = c + batch_stride_c * batch;
@@ -405,7 +406,7 @@
           c10::SmallBuffer<const scalar_t*, 16> b_ptrs(batch_size);
           c10::SmallBuffer<scalar_t*, 16> c_ptrs(batch_size);
 
-          for (int64_t batch = 0; batch < batch_size; ++batch) {
+          for (const auto batch : c10::irange(batch_size)) {
             a_ptrs[batch] = a + batch_stride_a * batch;
             b_ptrs[batch] = b + batch_stride_b * batch;
             c_ptrs[batch] = c + batch_stride_c * batch;
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 5a32752..c2551f0 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -1762,7 +1762,7 @@
         std::vector<Tensor> backend_grad_inputs(params.groups);
         std::vector<Tensor> backend_grad_weights(params.groups);
         std::vector<Tensor> backend_grad_biases(params.groups);
-        for (int g = 0; g < params.groups; ++g) {
+        for (const auto g : c10::irange(params.groups)) {
           auto grad_output_g = subtensor(grad_output, 1, params.groups, g);
           auto input_g = subtensor(input, 1, params.groups, g);
           auto weight_g = subtensor(weight, 0, params.groups, g);
diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp
index b746332..30fb04b 100644
--- a/aten/src/ATen/native/ConvolutionMM2d.cpp
+++ b/aten/src/ATen/native/ConvolutionMM2d.cpp
@@ -122,7 +122,7 @@
 
   // Allow for empty batch size and channel size but not other dimensions
   TORCH_CHECK(ndim == 4, "Expected 4D input tensor, but got: ", input.sizes());
-  for (int64_t dim = 2; dim < ndim; ++dim) {
+  for (const auto dim : c10::irange(2, ndim)) {
     TORCH_CHECK(input.size(dim) != 0,
                 "Expected non-zero size for input dimension ", dim,
                 ", but got input shape: ", input.sizes(), ". Only the batch and channel dimensions support size 0.");
@@ -444,7 +444,7 @@
     auto grad_weight_2d_a = grad_weight_2d.accessor<scalar_t, 2>();
     auto finput_a = finput.accessor<scalar_t, 3>();
 
-    for (int64_t t = 0; t < batch_size; t++) {
+    for (const auto t : c10::irange(batch_size)) {
       auto grad_output_t = grad_output_a[t];
       auto finput_t = finput_a[t];
 
diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
index 7f598d9..19844cb 100644
--- a/aten/src/ATen/native/EmbeddingBag.cpp
+++ b/aten/src/ATen/native/EmbeddingBag.cpp
@@ -130,7 +130,7 @@
     int64_t N,
     const index_t* offsets,
     const index_t* indices) {
-  for (int m = 0; m < output_size; ++m) {
+  for (const auto m : c10::irange(output_size)) {
     for (index_t i = offsets[m]; i < offsets[m + 1]; ++i) {
       TORCH_CHECK(i < index_size);
       index_t idx = indices[i];
@@ -899,7 +899,7 @@
   auto nonempty_max_indices = max_indices.index_select(0, bag_size.nonzero().view(-1));
   auto nonempty_grad = grad.index_select(0, bag_size.nonzero().view(-1));
 
-  for (int64_t dim = 0; dim < grad.sizes()[1]; dim++) {
+  for (const auto dim : c10::irange(grad.sizes()[1])) {
     index_grad_weight.select(1, dim).index_add_(
       0, nonempty_max_indices.select(1, dim), nonempty_grad.select(1, dim));
   }
diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp
index 5d31d89..99b3d93 100644
--- a/aten/src/ATen/native/GridSampler.cpp
+++ b/aten/src/ATen/native/GridSampler.cpp
@@ -428,11 +428,11 @@
   uint8_t* out_ptr = (uint8_t*)output.data_ptr<quint8>();
   float* grid_ptr = grid.data_ptr<float>();
   at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
-    for (int64_t n = start; n < end; ++n) {
+    for (const auto n : c10::irange(start, end)) {
       float* grid_ptr_N = grid_ptr + n * grid_sN;
       uint8_t* inp_ptr_N = inp_ptr + n * inp_sN;
-      for (int64_t h = 0; h < out_H; ++h) {
-        for (int64_t w = 0; w < out_W; ++w) {
+      for (const auto h : c10::irange(out_H)) {
+        for (const auto w : c10::irange(out_W)) {
           // get the corresponding input x, y, z co-ordinates from grid
           float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
           float x = *grid_ptr_NHW;
diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp
index 11a1a0f..abd1ae3 100644
--- a/aten/src/ATen/native/Histogram.cpp
+++ b/aten/src/ATen/native/Histogram.cpp
@@ -12,6 +12,7 @@
 #include <c10/util/ArrayRef.h>
 #include <c10/core/ScalarType.h>
 #include <c10/core/DefaultDtype.h>
+#include <c10/util/irange.h>
 
 /* Implements a numpy-like histogramdd function running on cpu
  * https://numpy.org/doc/stable/reference/generated/numpy.histogramdd.html
@@ -61,7 +62,7 @@
                 "-dimensional histogram but got ", bins.size());
 
     auto input_dtype = input.dtype();
-    for (int64_t dim = 0; dim < N; dim++) {
+    for (const auto dim : c10::irange(N)) {
         const Tensor& dim_bins = bins[dim];
 
         auto bins_dtype = dim_bins.dtype();
@@ -113,7 +114,7 @@
     TORCH_CHECK(input.dtype() == hist.dtype(), "torch.histogram: input tensor and hist tensor should",
             " have the same dtype, but got input ", input.dtype(), " and hist ", hist.dtype());
 
-    for (int64_t dim = 0; dim < N; dim++) {
+    for (const auto dim : c10::irange(N)) {
         TORCH_CHECK(input.dtype() == bin_edges[dim].dtype(), "torch.histogram: input tensor and bin_edges tensor should",
                 " have the same dtype, but got input ", input.dtype(), " and bin_edges ", bin_edges[dim].dtype(),
                 " for dimension ", dim);
@@ -167,7 +168,7 @@
         TORCH_CHECK((int64_t)range.value().size() == 2 * N, "torch.histogramdd: for a ", N, "-dimensional histogram",
                 " range should have ", 2 * N, " elements, but got ", range.value().size());
 
-        for (int64_t dim = 0; dim < N; dim++) {
+        for (const auto dim : c10::irange(N)) {
             leftmost_edges[dim] = range.value()[2 * dim];
             rightmost_edges[dim] = range.value()[2 * dim + 1];
         }
@@ -178,7 +179,7 @@
         });
     }
 
-    for (int64_t dim = 0; dim < N; dim++) {
+    for (const auto dim : c10::irange(N)) {
         double leftmost_edge = leftmost_edges[dim];
         double rightmost_edge = rightmost_edges[dim];
 
@@ -232,7 +233,7 @@
     TORCH_CHECK(self.dim() >= 2, "torch.histogramdd: input tensor should have at least 2 dimensions");
     const int64_t N = self.size(-1);
     std::vector<Tensor> bin_edges_out(N);
-    for (int64_t dim = 0; dim < N; dim++) {
+    for (const auto dim : c10::irange(N)) {
         bin_edges_out[dim] = at::empty({0}, self.options(), MemoryFormat::Contiguous);
     }
     return bin_edges_out;
@@ -246,7 +247,7 @@
     histogramdd_check_inputs(self, bins, weight);
     histogramdd_prepare_out(self, bins, hist, bin_edges);
 
-    for (size_t dim = 0; dim < bins.size(); dim++) {
+    for (const auto dim : c10::irange(bins.size())) {
         bin_edges[dim].copy_(bins[dim]);
     }
 
@@ -280,7 +281,7 @@
 
     auto outer_bin_edges = select_outer_bin_edges(reshaped_self, range);
 
-    for (int64_t dim = 0; dim < N; dim++) {
+    for (const auto dim : c10::irange(N)) {
         linspace_out(outer_bin_edges.first[dim], outer_bin_edges.second[dim],
                 bin_ct[dim] + 1, bin_edges_out[dim]);
     }
@@ -304,7 +305,7 @@
     histogramdd_check_inputs(self, bins, weight);
     histogramdd_prepare_out(self, bins, hist, bin_edges);
 
-    for (size_t dim = 0; dim < bins.size(); dim++) {
+    for (const auto dim : c10::irange(bins.size())) {
         bin_edges[dim].copy_(bins[dim]);
     }
 
diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp
index f7c20d4..344c726 100644
--- a/aten/src/ATen/native/LossCTC.cpp
+++ b/aten/src/ATen/native/LossCTC.cpp
@@ -118,7 +118,7 @@
 
       // now the loop over the inputs
       for (const auto t : c10::irange(1, input_length)) {
-        for (int64_t s=0; s<2*target_length+1; s++) {
+        for (const auto s : c10::irange(2*target_length+1)) {
           auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK);
           // this loop over s could be parallel/vectorized, too, but the required items are one index apart
           // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending)
diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp
index b0c6fe2..6d395d9 100644
--- a/aten/src/ATen/native/MaxUnpooling.cpp
+++ b/aten/src/ATen/native/MaxUnpooling.cpp
@@ -1,6 +1,7 @@
 #include <ATen/ATen.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/native/cpu/MaxUnpoolKernel.h>
+#include <c10/util/irange.h>
 
 namespace at {
 namespace native {
@@ -26,7 +27,7 @@
       "Expected shape of indices to be same as that of the input tensor (", self_.sizes(),
       ") but got indices tensor with shape: ", indices_.sizes());
 
-  for (int64_t i = 1; i < self_.ndimension(); ++i) {
+  for (const auto i : c10::irange(1, self_.ndimension())) {
     TORCH_CHECK(self_.size(i) > 0, "max_unpooling2d_forward_out_cpu(): ",
                 "Expected input to have non-zero size for non-batch dimensions, but got ",
                 self_.sizes(), " with dimension ", i , " being empty.");
@@ -93,7 +94,7 @@
       "Expected shape of indices to be same as that of the input tensor (", input.sizes(),
       ") but got indices tensor with shape: ", indices.sizes());
 
-  for (int64_t i = 1; i < input.ndimension(); ++i) {
+  for (const auto i : c10::irange(1, input.ndimension())) {
     TORCH_CHECK(input.size(i) > 0, fn_name,
                 ": Expected input to have non-zero size for non-batch dimensions, but got ",
                 input.sizes(), " with dimension ", i , " being empty.");
diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp
index 866127c..acac58e 100644
--- a/aten/src/ATen/native/SegmentReduce.cpp
+++ b/aten/src/ATen/native/SegmentReduce.cpp
@@ -59,7 +59,7 @@
             }
 
             // ===== step2: apply reduction
-            for (int64_t j = 0; j < lengths_data[i]; ++j) {
+            for (const auto j : c10::irange(lengths_data[i])) {
               int64_t starting_index =
                   ((lengths_cum_sum + j) * stride_count) + l;
               const auto data = values_data[starting_index];
@@ -153,7 +153,7 @@
             if (reduction == SegmentReductionType::MAX ||
                 reduction == SegmentReductionType::MIN) {
               int64_t counter = 0;
-              for (int64_t j = 0; j < lengths_data[i]; ++j) {
+              for (const auto j : c10::irange(lengths_data[i])) {
                 int64_t starting_index =
                     ((lengths_cum_sum + j) * stride_count) + l;
                 if (at::_isnan(values_data[starting_index]) ||
@@ -167,7 +167,7 @@
               if (counter < 2) {
                 continue;
               }
-              for (int64_t j = 0; j < lengths_data[i]; ++j) {
+              for (const auto j : c10::irange(lengths_data[i])) {
                 int64_t starting_index =
                     ((lengths_cum_sum + j) * stride_count) + l;
                 if (grad_input_data[starting_index] > 0) {
@@ -177,14 +177,14 @@
               }
             } else if (reduction == SegmentReductionType::MEAN) {
               auto grad_val = grad_data[output_index] / lengths_data[i];
-              for (int64_t j = 0; j < lengths_data[i]; ++j) {
+              for (const auto j : c10::irange(lengths_data[i])) {
                 int64_t starting_index =
                     ((lengths_cum_sum + j) * stride_count) + l;
                 grad_input_data[starting_index] = grad_val;
               }
             } else if (reduction == SegmentReductionType::SUM) {
               const auto& grad_val = grad_data[output_index];
-              for (int64_t j = 0; j < lengths_data[i]; ++j) {
+              for (const auto j : c10::irange(lengths_data[i])) {
                 int64_t starting_index =
                     ((lengths_cum_sum + j) * stride_count) + l;
                 grad_input_data[starting_index] = grad_val;
diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
index 8898735..8a1e08c 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -1301,7 +1301,7 @@
 
   TORCH_CHECK(self.dim() == index.dim(),
       "Shape mismatch between `self` (got ", self.sizes(), ") and `index` (got ", index.sizes(), ")");
-  for (int64_t i = 0; i < self.dim(); i++) {
+  for (const auto i : c10::irange(self.dim())) {
     TORCH_CHECK(self.size(i) == index.size(i),
         "Shape mismatch between `self` (got ", self.sizes(), ") and `index` (got ", index.sizes(), ")");
   }
@@ -1338,16 +1338,15 @@
 
 
     int64_t offset1 = 1, offset2 = 1;
-    for (int64_t d = 0; d < dim; d++)
-      offset1 *= self.size(d);
+    for (const auto d : c10::irange(dim))offset1 *= self.size(d);
     for (int64_t d = dim + 1; d < self.dim(); d++)
       offset2 *= self.size(d);
 
     scalar_t value;
     int64_t dim_index;
-    for (int64_t i = 0; i < offset1; i++) {
-      for (int64_t j = 0; j < self.size(dim); j++) {
-        for (int64_t k = 0; k < offset2; k++) {
+    for (const auto i : c10::irange(offset1)) {
+      for (const auto j : c10::irange(self.size(dim))) {
+        for (const auto k : c10::irange(offset2)) {
           value = self_data[i * self_cont.stride(dim) * self_cont.size(dim) + j * self_cont.stride(dim) + k];
           dim_index = index_data[i * index_cont.stride(dim) * index_cont.size(dim) + j * index_cont.stride(dim) + k];
           TORCH_CHECK(dim_index >= 0 && dim_index < out.size(dim),
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 62c7370..8a9026c 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -107,12 +107,12 @@
   int64_t nnz_factor = 1;
   int64_t min_broadcast_dim = (sparse_extra_ndim > 0 ? 0: -1);
   int64_t max_unchanged_dim = -1;
-  for (int64_t i=0; i<sparse_extra_ndim; i++) {
+  for (const auto i : c10::irange(sparse_extra_ndim)) {
     auto d = size[i];
     nnz_factor *= d;
     broadcast_sizes.emplace_back(d);
   }
-  for (int64_t i=0; i<self.sparse_dim(); i++) {
+  for (const auto i : c10::irange(self.sparse_dim())) {
     auto d = size[sparse_extra_ndim + i];
     if (self.size(i) != d) {
       TORCH_CHECK(self.size(i) == 1,
@@ -136,7 +136,7 @@
   bool is_coalesced = self.dim()==0 || (self.is_coalesced() && (max_unchanged_dim < min_broadcast_dim || min_broadcast_dim == -1));
 
   broadcast_dense_sizes.emplace_back(nnz);
-  for (int64_t i=0; i<self.dense_dim(); i++) {
+  for (const auto i : c10::irange(self.dense_dim())) {
     broadcast_dense_sizes.emplace_back(size[sparse_extra_ndim + self.sparse_dim() + i]);
   }
 
@@ -152,7 +152,7 @@
     // auxilary arange tensors
     Tensor broadcast_indices = at::native::new_ones(indices, broadcast_sizes).nonzero().transpose(0, 1).tile(nnz);
     new_indices.narrow(0, 0, sparse_extra_ndim).copy_(broadcast_indices.narrow(0, 0, sparse_extra_ndim));
-    for (size_t i=0; i<broadcast_dims.size(); i++) {
+    for (const auto i : c10::irange(broadcast_dims.size())) {
       int64_t j=broadcast_dims[i];
       new_indices.select(0, sparse_extra_ndim + j).copy_(broadcast_indices.select(0, sparse_extra_ndim + i));
     }
@@ -1287,7 +1287,7 @@
     std::vector<int64_t> zindices;
     std::vector<int64_t> iindices;
     int64_t new_nnz = 0;
-    for (int64_t i = 0; i < new_sizes[dim]; i++) {
+    for (const auto i : c10::irange(new_sizes[dim])) {
       int64_t idx = cpu_index_ptr[i];
       if (idx < -size || idx >= size) {
         TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ",
@@ -1296,7 +1296,7 @@
       if (idx < 0) {
         idx += size;
       }
-      for (int64_t j = 0; j < nnz; j++) {
+      for (const auto j : c10::irange(nnz)) {
         int64_t jdx = cpu_dim_indices_ptr[j];
         if (idx == jdx) {
           new_nnz++;
diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp
index 448b05f..f73bb50 100644
--- a/aten/src/ATen/native/UpSampleBilinear2d.cpp
+++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp
@@ -76,7 +76,7 @@
       grad_output.dim() == 4,
       "Expected grad_output to be a tensor of dimension 4 but got: dimension ", grad_output.dim());
 
-  for (int i = 0; i < 4; ++i) {
+  for (const auto i : c10::irange(4)) {
     TORCH_CHECK(
         grad_output.size(i) == full_output_size[i],
         "Expected grad_output to have the same shape as output;",
diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp
index 0f0af3a..864121f 100644
--- a/aten/src/ATen/native/UpSampleNearest2d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest2d.cpp
@@ -72,7 +72,7 @@
       grad_output.dim() == 4,
       "Expected grad_output to be a tensor of dimension 4 but got: dimension ", grad_output.dim());
 
-  for (int i = 0; i < 4; ++i) {
+  for (const auto i : c10::irange(4)) {
     TORCH_CHECK(
         grad_output.size(i) == full_output_size[i],
         "Expected grad_output to have the same shape as output;",
diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp
index 1623a4f..c659a86 100644
--- a/aten/src/ATen/native/UpSampleNearest3d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest3d.cpp
@@ -81,7 +81,7 @@
       grad_output.dim() == 5,
       "Expected grad_output to be a tensor of dimension 5 but got: dimension ", grad_output.dim());
 
-  for (int i = 0; i < 5; ++i) {
+  for (const auto i : c10::irange(5)) {
     TORCH_CHECK(
         grad_output.size(i) == full_output_size[i],
         "Expected grad_output to have the same shape as output;",
diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp
index 9a6cace..4c69cae 100644
--- a/aten/src/ATen/native/attention.cpp
+++ b/aten/src/ATen/native/attention.cpp
@@ -6,6 +6,7 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/Parallel.h>
 #include <ATen/cpu/vec/vec256/vec256.h>
+#include <c10/util/irange.h>
 
 namespace at {
 
@@ -141,14 +142,14 @@
                 }
 
                 auto hmax = std::numeric_limits<scalar_t>::lowest();
-                for (auto i = 0; i < V; ++i) {
+                for (const auto i : c10::irange(V)) {
                   hmax = std::max(max_input[i], hmax);
                 }
                 accscalar_t hsum = 0;
                 for (auto t = 0; t < T; t += V) {
                   auto v = Vec::loadu(&input_data[t]);
                   // TODO: vectorize in accscalar_t?
-                  for (auto i = 0; i < V; ++i) {
+                  for (const auto i : c10::irange(V)) {
                     hsum += std::exp(static_cast<accscalar_t>(v[i]) - hmax);
                   }
                 }
@@ -159,12 +160,12 @@
                   // TODO: vectorize in accscalar_t?
                   // TODO this faster solution does not work on Android build
                   /*
-                  for (auto i = 0; i < V; ++i) {
+                  for (const auto i : c10::irange(V)) {
                     v[i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
                   }
                   v.store(&input_data[t]);
                   */
-                  for (auto i = 0; i < V; ++i) {
+                  for (const auto i : c10::irange(V)) {
                     input_data[t + i] = static_cast<scalar_t>(std::exp(static_cast<accscalar_t>(v[i]) - hmax) * inv_denominator);
                   }
                 }
diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp
index e0b8551..328f333 100644
--- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp
+++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp
@@ -286,7 +286,7 @@
       if (ih0 >= ih1 || iw0 >= iw1) {
         // since we are not directly using output as the accumulation buffer,
         // in case the kernel window is out of range, need to zero the output buffer here.
-        for (int64_t k = 0; k < size; k++) {
+        for (const auto k : c10::irange(size)) {
           out[k] = 0;
         }
         // move on to next output index
diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp
index 0a9c4c0..583f367 100644
--- a/aten/src/ATen/native/cpu/HistogramKernel.cpp
+++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp
@@ -80,7 +80,7 @@
 
     const int64_t D = input.size(1);
     TORCH_INTERNAL_ASSERT(int64_t(bin_edges.size()) == D);
-    for (int64_t dim = 0; dim < D; dim++) {
+    for (const auto dim : c10::irange(D)) {
         TORCH_INTERNAL_ASSERT(bin_edges[dim].is_contiguous());
         TORCH_INTERNAL_ASSERT(hist.size(dim) + 1 == bin_edges[dim].numel());
     }
@@ -103,7 +103,7 @@
     std::vector<int64_t> num_bin_edges(D);
     std::vector<input_t> leftmost_edge(D), rightmost_edge(D);
 
-    for (int64_t dim = 0; dim < D; dim++) {
+    for (const auto dim : c10::irange(D)) {
         bin_seq[dim] = bin_edges[dim].data_ptr<input_t>();
         num_bin_edges[dim] = bin_edges[dim].numel();
         leftmost_edge[dim] = bin_seq[dim][0];
@@ -138,7 +138,7 @@
             bool skip_elt = false;
             int64_t hist_index = 0;
 
-            for (int64_t dim = 0; dim < D; dim++) {
+            for (const auto dim : c10::irange(D)) {
                 const input_t elt = accessor_in[i][dim];
 
                 // Skips elements which fall outside the specified bins
@@ -215,7 +215,7 @@
             : c10::optional<Tensor>();
 
     std::vector<Tensor> bin_edges_contig(bin_edges.size());
-    for (size_t dim = 0; dim < bin_edges_contig.size(); dim++) {
+    for (const auto dim : c10::irange(bin_edges_contig.size())) {
         bin_edges_contig[dim] = bin_edges[dim].contiguous();
     }
 
@@ -234,7 +234,7 @@
          /* For each dimension, divides each bin's value
           * by the bin's length in that dimension.
           */
-        for (int64_t dim = 0; dim < N; dim++) {
+        for (const auto dim : c10::irange(N)) {
             const auto bin_lengths = bin_edges[dim].diff();
 
             // Used to reshape bin_lengths to align with the corresponding dimension of hist.
diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp
index dcddc6c..d4a6eb3 100644
--- a/aten/src/ATen/native/cpu/SumKernel.cpp
+++ b/aten/src/ATen/native/cpu/SumKernel.cpp
@@ -28,7 +28,7 @@
   alignas(64) std::array<acc_t, vacc_t::size()> acc;
   acc.fill(ident);
   for (const auto k : c10::irange(vstride)) {
-    for (int i = 0; i < vacc_t::size(); ++i) {
+    for (const auto i : c10::irange(vacc_t::size())) {
       acc[i] = reduce(acc[i], values[i * vstride + k]);
     }
   }
@@ -139,7 +139,7 @@
     val.store(values);
 
     alignas(64) acc_t acc[vacc_t::size()];
-    for (int i = 0; i < vacc_t::size(); ++i) {
+    for (const auto i : c10::irange(vacc_t::size())) {
       acc[i] = values[i];
     }
 
diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
index 31cf12d..88bdbd7 100644
--- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
@@ -851,7 +851,7 @@
         // input_index = round(index_f32)
         // Same as Pillow and Scikit-Image/Scipy ndi.zoom
 
-        for (int64_t i=0; i<output_size; i++) {
+        for (const auto i : c10::irange(output_size)) {
           const scalar_t real_input_index = area_pixel_compute_source_index<scalar_t>(
               scale, i, /*align_corners=*/align_corners, /*cubic=*/false);
           input_index = static_cast<int64_t>(floorf(real_input_index + 0.5));
diff --git a/aten/src/ATen/native/cuda/Activation.cpp b/aten/src/ATen/native/cuda/Activation.cpp
index 0b5a929..9ad2ef8 100644
--- a/aten/src/ATen/native/cuda/Activation.cpp
+++ b/aten/src/ATen/native/cuda/Activation.cpp
@@ -2,6 +2,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/native/Resize.h>
+#include <c10/util/irange.h>
 
 namespace at { namespace native {
 
@@ -145,7 +146,7 @@
     std::vector<int64_t> reduce_dims;
     reduce_dims.push_back(0);
     if (dims > 2) {
-      for(int64_t i = 2; i < dims; i++) reduce_dims.push_back(i);
+      for (const auto i : c10::irange(2, dims))reduce_dims.push_back(i);
     }
     weight_grad = weight_grad_collector.sum(reduce_dims);
   }
diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp
index 16bd6db..e8bffaf 100644
--- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp
@@ -6,6 +6,7 @@
 #include <ATen/cuda/detail/IndexUtils.cuh>
 
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/cuda/MiscUtils.h>
@@ -1287,7 +1288,7 @@
     ALLOCATE_ARRAY(b_array, scalar_t*, batch_size);
 
     // Set up the created arrays
-    for (int64_t i = 0; i < batch_size; i++) {
+    for (const auto i : c10::irange(batch_size)) {
       A_array[i] = &A_data[i * A_mat_stride];
       b_array[i] = &b_data[i * b_mat_stride];
       ipiv_array[i] = &ipiv_data[i * n];
@@ -1380,7 +1381,7 @@
   ALLOCATE_ARRAY(self_inv_array, scalar_t*, batch_size);
 
   // Set up the created arrays
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     self_array[i] = &self_data[i * self_mat_stride];
     self_inv_array[i] = &self_inv_data[i * self_inv_mat_stride];
     ipiv_array[i] = &ipiv_data[i * n];
@@ -1521,7 +1522,7 @@
     ALLOCATE_ARRAY(b_array, scalar_t*, batch_size);
 
     // Set up the created arrays
-    for (int64_t i = 0; i < batch_size; i++) {
+    for (const auto i : c10::irange(batch_size)) {
       A_array[i] = &A_data[i * A_mat_stride];
       b_array[i] = &b_data[i * b_mat_stride];
     }
@@ -1628,7 +1629,7 @@
     ALLOCATE_ARRAY(self_array, scalar_t*, batch_size);
 
     // Set up the created arrays
-    for (int64_t i = 0; i < batch_size; i++) {
+    for (const auto i : c10::irange(batch_size)) {
       self_array[i] = &self_data[i * self_mat_stride];
     }
 
@@ -1882,7 +1883,7 @@
   ALLOCATE_ARRAY(input_array, scalar_t*, batch_size);
 
   // Set up array of pointers to matrices
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     input_array[i] = &input_data[i * input_matrix_stride];
   }
 
@@ -1897,7 +1898,7 @@
     pivots.fill_(1);
     magma_int_t** pivots_array;
     ALLOCATE_ARRAY(pivots_array, magma_int_t*, batch_size);
-    for (int64_t i = 0; i < batch_size; i++) {
+    for (const auto i : c10::irange(batch_size)) {
       pivots_array[i] = &pivots_data[i * pivots_stride];
     }
     magmaLuBatched<scalar_t>(m, n, input_array, leading_dimension, pivots_array, infos_data, batch_size, magma_queue);
@@ -2040,7 +2041,7 @@
   ALLOCATE_ARRAY(b_array, scalar_t*, batch_size);
 
   // Set up the created arrays
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     A_array[i] = &A_data[i * A_mat_stride];
     b_array[i] = &b_data[i * b_mat_stride];
   }
@@ -2155,7 +2156,7 @@
   scalar_t* work_data = nullptr; // workspace is not needed for geqrf2_gpu
 
   magma_int_t info = 0;
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
     scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
 
@@ -2241,7 +2242,7 @@
   // This phase computes R (the raw version)
   // This uses MAGMA's ?geqrf2_gpu function
   magma_int_t info = 0;
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     scalar_t* r_working_ptr = &r_data[i * r_matrix_stride];
     magmaGeqrf<scalar_t>(m, n, r_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/true);
     checkMagmaInternalError(info, "geqrf");
@@ -2259,7 +2260,7 @@
   // http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=1015&p=2800&hilit=geqrf_gpu#p2800
   auto q_data = Q.data_ptr<scalar_t>();
   auto q_matrix_stride = matrixStride(Q);
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     scalar_t* q_working_ptr = &q_data[i * q_matrix_stride];
     magmaGeqrf<scalar_t>(m, n, q_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/false);
     checkMagmaInternalError(info, "geqrf");
@@ -2649,7 +2650,7 @@
   Tensor work = at::empty({lwork}, input.dtype());
   auto work_data = work.data_ptr<scalar_t>();
 
-  for (auto i = decltype(batch_size){0}; i < batch_size; i++) {
+  for (const auto i : c10::irange(decltype(batch_size){0}, batch_size)) {
     scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
     scalar_t* values_working_ptr = &values_data[i * values_stride];
     scalar_t* rvectors_working_ptr = compute_eigenvectors ? &rvectors_data[i * input_matrix_stride] : nullptr;
@@ -2739,7 +2740,7 @@
   scalar_t* work;
   ALLOCATE_ARRAY(work, scalar_t, lwork);
 
-  for (int64_t i = 0; i < batchsize; i++) {
+  for (const auto i : c10::irange(batchsize)) {
     // Compute S, U (optionally), Vh (optionally)
     magmaSvd<scalar_t, value_t>(jobz, m, n,
                                 A_data + i * A_stride, lda,
@@ -2919,7 +2920,7 @@
   ALLOCATE_ARRAY(lu_array, scalar_t*, batch_size);
   ALLOCATE_ARRAY(b_array, scalar_t*, batch_size);
 
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     pivots_array[i] = &pivots_data[i * pivots_stride];
     b_array[i] = &b_data[i * b_stride];
     lu_array[i] = &lu_data[i * lu_stride];
diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp
index e67967e..661f136 100644
--- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp
+++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.cpp
@@ -282,7 +282,7 @@
   // Heuristic: For small batch size or large matrix size, we use for-loop to iterate over the batches instead of
   //            calling the batched cublas routine.
   if (batch_size <= 8 || /* batch_size > 8 && */ n >= 512) {
-    for (int64_t i = 0; i < batch_size; i++) {
+    for (const auto i : c10::irange(batch_size)) {
       auto dataPtr = allocator.allocate(sizeof(int) * lda);
       int* pivot = reinterpret_cast<int*>(dataPtr.get());
 
@@ -398,7 +398,7 @@
                                         : batches.size();
 
 
-  for(int _i = 0; _i < batchsize; _i++){
+  for (const auto _i : c10::irange(batchsize)) {
     int i = calculate_all_batches ? _i : batches[_i];
 
     at::cuda::solver::gesvd<scalar_t>(
@@ -487,7 +487,7 @@
 
   auto dataPtr = allocator.allocate(sizeof(scalar_t)*lwork);
 
-  for(int i = 0; i < batchsize; i++){
+  for (const auto i : c10::irange(batchsize)) {
     at::cuda::solver::gesvdj<scalar_t>(
       handle, jobz, econ, m, n,
       A_data + i * A_stride,
@@ -577,7 +577,7 @@
 
   std::vector<int64_t> res;
 
-  for(int64_t i = 0; i < infos.numel(); i++) {
+  for (const auto i : c10::irange(infos.numel())) {
     int info_for_batch_i = infos_cpu_data[i];
 
     // From cusolver doc, if info < 0, the i-th function call parameter is wrong,
@@ -685,7 +685,7 @@
   auto workdata_host = host_allocator.allocate(worksize_host * batch_size);
   void* workdata_host_ptr = workdata_host.get();
 
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     at::cuda::solver::xpotrf(
       handle, params, uplo, n, datatype,
       self_working_copy_ptr + i * matrix_stride,
@@ -709,7 +709,7 @@
   auto work_data = allocator.allocate(sizeof(scalar_t)*lwork * batch_size);
   scalar_t* work_data_ptr = static_cast<scalar_t*>(work_data.get());
 
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     at::cuda::solver::potrf<scalar_t>(
       handle, uplo, n_32,
       self_working_copy_ptr + i * matrix_stride,
@@ -782,7 +782,7 @@
   cudaDataType datatype = at::cuda::solver::get_cusolver_datatype<scalar_t>();
   TORCH_CUSOLVER_CHECK(cusolverDnCreateParams(&params));
 
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     at::cuda::solver::xpotrs(
       handle, params, uplo, n, nrhs, datatype,
       A_ptr + i * A_matrix_stride,
@@ -800,7 +800,7 @@
   int lda_32 = cuda_int_cast(lda, "lda");
   int ldb_32 = cuda_int_cast(ldb, "ldb");
 
-  for (int64_t i = 0; i < batch_size; i++) {
+  for (const auto i : c10::irange(batch_size)) {
     at::cuda::solver::potrs<scalar_t>(
       handle, uplo, n_32, nrhs_32,
       A_ptr + i * A_matrix_stride,
@@ -1041,7 +1041,7 @@
   auto info = at::zeros({1}, input.options().dtype(at::kInt));
   auto info_data = info.data_ptr<int>();
 
-  for (auto i = decltype(batch_size){0}; i < batch_size; i++) {
+  for (const auto i : c10::irange(decltype(batch_size){0}, batch_size)) {
     scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
     scalar_t* other_working_ptr = &other_data[i * other_matrix_stride];
     scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
@@ -1119,7 +1119,7 @@
   auto info = at::zeros({1}, self.options().dtype(at::kInt));
   auto info_data = info.data_ptr<int>();
 
-  for (auto i = decltype(batchsize){0}; i < batchsize; i++) {
+  for (const auto i : c10::irange(decltype(batchsize){0}, batchsize)) {
     scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
     scalar_t* tau_working_ptr = &tau_data[i * tau_stride];
     auto handle = at::cuda::getCurrentCUDASolverDnHandle();
@@ -1411,7 +1411,7 @@
     const auto pivots_stride = get_pivots ? pivots.size(-1) : 0;
 
     const auto handle = at::cuda::getCurrentCUDASolverDnHandle();
-    for (auto batch = decltype(batch_size){0}; batch < batch_size; ++batch) {
+    for (const auto batch : c10::irange(decltype(batch_size){0}, batch_size)) {
       at::cuda::solver::getrf<scalar_t>(
         handle, m, n,
         self_data + batch * self_stride,
@@ -1447,7 +1447,7 @@
     int leading_dimension = cuda_int_cast(std::max<int>(1, n), "leading_dimension");
 
     auto handle = at::cuda::getCurrentCUDASolverDnHandle();
-    for (auto batch = decltype(batch_size){0}; batch < batch_size; ++batch) {
+    for (const auto batch : c10::irange(decltype(batch_size){0}, batch_size)) {
       at::cuda::solver::getrs<scalar_t>(
         handle,
         n,
diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp
index f201bcf..c7772a2 100644
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@@ -358,7 +358,7 @@
     Array<${index_type}, NARGS> get(${index_type} linear_idx) const {
       Array<${index_type}, NARGS> offsets;
       #pragma unroll
-      for (int arg = 0; arg < NARGS; arg++) {
+      for (const auto arg : c10::irange(NARGS)) {
         offsets[arg] = linear_idx;
       }
       return offsets;
@@ -371,12 +371,12 @@
   __device__ __forceinline__ Array<${index_type}, NARGS> get(${index_type} linear_idx) const {
       Array<${index_type}, NARGS> offsets;
       #pragma unroll
-      for (int arg = 0; arg < NARGS; ++arg) {
+      for (const auto arg : c10::irange(NARGS)) {
       offsets[arg] = 0;
       }
 
       #pragma unroll
-      for (int dim = 0; dim < 25; ++dim) {
+      for (const auto dim : c10::irange(25)) {
       if (dim == dims) {
           break;
       }
@@ -385,7 +385,7 @@
       linear_idx = divmod.div;
 
       #pragma unroll
-      for (int arg = 0; arg < NARGS; ++arg) {
+      for (const auto arg : c10::irange(NARGS)) {
           offsets[arg] += divmod.mod * strides_[dim][arg];
       }
       //printf("offset calc thread dim size stride offset %d %d %d %d %d %d %d %d\n",
@@ -421,7 +421,7 @@
     auto thread_idx = threadIdx.x;
 
     #pragma unroll
-    for (int j = 0; j < thread_work_size; j++){
+    for (const auto j : c10::irange(thread_work_size)) {
         if (thread_idx >= remaining) {
             break;
         }
@@ -435,7 +435,7 @@
     }
 
     #pragma unroll
-    for (int j = 0; j < thread_work_size; j++) {
+    for (const auto j : c10::irange(thread_work_size)) {
       if ((threadIdx.x  + j*num_threads) < remaining) {
         out[j] = ${name}<${compute_type}>(${args}${extra_args});
       }
@@ -443,7 +443,7 @@
 
     thread_idx = threadIdx.x;
     #pragma unroll
-    for (int j = 0; j < thread_work_size; j++){
+    for (const auto j : c10::irange(thread_work_size)) {
         if (thread_idx >= remaining) {
             break;
         }
@@ -496,7 +496,7 @@
 
       if (remaining < block_work_size) {
         #pragma unroll
-        for (int j = 0; j < thread_work_size; j++){
+        for (const auto j : c10::irange(thread_work_size)) {
           if (thread_idx >= remaining) {
             break;
           }
@@ -505,14 +505,14 @@
           thread_idx += num_threads;
         }
         #pragma unroll
-        for (int j = 0; j < thread_work_size; j++) {
+        for (const auto j : c10::irange(thread_work_size)) {
           if ((threadIdx.x  + j*num_threads) < remaining) {
             out[j] = ${name}<${compute_type}>(${args} ${extra_args});
           }
         }
         thread_idx = threadIdx.x;
         #pragma unroll
-        for (int j = 0; j < thread_work_size; j++) {
+        for (const auto j : c10::irange(thread_work_size)) {
           if (thread_idx >= remaining) {
               break;
           }
@@ -526,7 +526,7 @@
         using vec_t_input = aligned_vector<${scalar_type}, vec_size>;
         ${vector_pointers}
         #pragma unroll
-        for (int i = 0; i<loop_size; i++){
+        for (const auto i : c10::irange(loop_size)) {
           vec_t_input v;
           ${load_vectorized_inputs}
           thread_idx += num_threads;
@@ -534,17 +534,17 @@
 
 
         #pragma unroll
-        for (int j = 0; j < thread_work_size; j++) {
+        for (const auto j : c10::irange(thread_work_size)) {
           out[j] = ${name}<${compute_type}>(${args}${extra_args});
         }
         using vec_t_output = aligned_vector<${result_type}, vec_size>;
         vec_t_output * to_ = reinterpret_cast<vec_t_output *>(data[0]) + block_work_size / vec_size * idx;
         int thread_idx = threadIdx.x;
         #pragma unroll
-        for (int i = 0; i<loop_size; i++){
+        for (const auto i : c10::irange(loop_size)) {
           vec_t_output v;
           #pragma unroll
-          for (int j=0; j<vec_size; j++){
+          for (const auto j : c10::irange(vec_size)) {
             v.val[j] = out[vec_size * i + j];
           }
           to_[thread_idx] = v;
@@ -664,7 +664,7 @@
   // (look at polygamma for example).
   std::string extra_params = "";
   std::string extra_args = "";
-  for (size_t i = 0; i < extra_args_typenames.size(); i++) {
+  for (const auto i : c10::irange(extra_args_typenames.size())) {
     auto type = std::string(extra_args_typenames[i]);
     auto name = "extra_arg_" + std::string(to_string(i));
     extra_params += "," + type + " " + name;
@@ -674,7 +674,7 @@
   env.s("extra_args", extra_args);
 
   std::stringstream declare_load_arrays;
-  for (int i = 0; i < nInputs; i++) {
+  for (const auto i : c10::irange(nInputs)) {
     // TODO these arrays are potentially of the different types, use function
     // traits to determine the types
     declare_load_arrays << f_inputs_type << " arg" << std::to_string(i)
@@ -746,7 +746,7 @@
     }
 
     std::stringstream load_inputs;
-    for (int i = 0; i < nInputs; i++) {
+    for (const auto i : c10::irange(nInputs)) {
       auto i_string = std::to_string(i);
       load_inputs << "arg" << i_string << "[j] = l.load<" << f_inputs_type
                   << ">(data[" << std::to_string(i + nOutputs)
@@ -781,7 +781,7 @@
     auto i_string = std::to_string(i);
     load_vectorized_inputs << "v = vec" << i_string << "[thread_idx];\n";
     load_vectorized_inputs << "#pragma unroll\n";
-    load_vectorized_inputs << "for (int j=0; j < vec_size; j++){\n";
+    load_vectorized_inputs << "for (const auto j : c10::irange(vec_size)) {\n";
     load_vectorized_inputs << "  arg" << i_string << "[vec_size * i + j] = v.val[j];\n";
     load_vectorized_inputs << "}\n";
   }
diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp
index 870b010..bcf8afe 100644
--- a/aten/src/ATen/native/mkl/SpectralOps.cpp
+++ b/aten/src/ATen/native/mkl/SpectralOps.cpp
@@ -82,7 +82,7 @@
     // Explicitly loop over a Hermitian mirrored dimension
     if (iter_index[0] > 0) {
       auto end = std::min(signal_half_sizes[0], iter_index[0] + numel_remaining);
-      for (int64_t i = iter_index[0]; i < end; ++i) {
+      for (const auto i : c10::irange(iter_index[0], end)) {
         out_ptr[(signal_half_sizes[0] - i) * out_strides[0]] = std::conj(in_ptr[i * in_strides[0]]);
       }
       numel_remaining -= (end - iter_index[0]);
diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
index 0d3bf94..cf18da7 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
@@ -149,7 +149,7 @@
     int64_t N,
     const OffsetType* offsets,
     const IndexType* indices) {
-  for (int m = 0; m < output_size; ++m) {
+  for (const auto m : c10::irange(output_size)) {
     for (OffsetType i = offsets[m]; i < offsets[m + 1]; ++i) {
       TORCH_CHECK(i < index_size);
       IndexType idx = indices[i];
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
index 522a12d..4bfaa1c 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
@@ -120,7 +120,7 @@
   output_t* data_out = row0.data_ptr<output_t>();
   row1.copy_(*col_indices.expect_contiguous());
   at::parallel_for(0, nrows, GRAIN_SIZE, [&](int64_t start, int64_t end) {
-    for (int64_t i = start; i < end; i++) {
+    for (const auto i : c10::irange(start, end)) {
       std::fill(&data_out[crow_indices_data_in[i]], &data_out[crow_indices_data_in[i + 1]], static_cast<output_t>(i));
     }
   });
diff --git a/aten/src/ATen/native/vulkan/ops/Concat.cpp b/aten/src/ATen/native/vulkan/ops/Concat.cpp
index ed6bbae..3d58786 100644
--- a/aten/src/ATen/native/vulkan/ops/Concat.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Concat.cpp
@@ -1,5 +1,6 @@
 #include <ATen/native/vulkan/api/Helper.h>
 #include <ATen/native/vulkan/ops/Common.h>
+#include <c10/util/irange.h>
 #include <torch/library.h>
 
 namespace at {
@@ -128,7 +129,7 @@
         v_self.extents().data[1u],
         depth_slice};
 
-      for (int b = 0; b < tensor.sizes()[0]; ++b) {
+      for (const auto b : c10::irange(tensor.sizes()[0])) {
         src_offset.data[2u] = safe_downcast<uint32_t>(depth_slice * b);
         dst_offset.data[2u] = depth_size_allprior + safe_downcast<uint32_t>(depth_interval * b);
         api::helper::copy_texture_to_texture(command_buffer,
@@ -215,7 +216,7 @@
       is_mult4ch = false;
     }
 
-    for (int d = 0; d < 4; ++d) {
+    for (const auto d : c10::irange(4)) {
       if (d == dim) {
         continue;
       }
diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
index 3405afe..c7d629c 100644
--- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
@@ -100,7 +100,7 @@
   float* const dst_weight_ptr = v_weight_payload.get();
   memset(dst_weight_ptr, 0, v_weight.nbytes());
 
-  for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
+  for (const auto src_oc : c10::irange(src_filter[Layout::Filter::output])) {
     /* Source */
     const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;
 
@@ -112,7 +112,7 @@
                                     dst_c * dst_kernel_sz +
                                     dst_oh * dst_kw_sz;
 
-    for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) {
+    for (const auto src_ih : c10::irange(src_filter[Layout::Filter::height])) {
       memcpy(
           dst_weight_c_ptr + src_ih * src_kw_sz,
           src_weight_oc_ptr + src_ih * src_kw_sz,
@@ -161,7 +161,7 @@
   float* const dst_weight_ptr = v_weight_payload.get();
   memset(dst_weight_ptr, 0, v_weight.nbytes());
 
-  for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
+  for (const auto src_oc : c10::irange(src_filter[Layout::Filter::output])) {
     /* Source */
     const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;
 
@@ -171,7 +171,7 @@
 
     float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;
 
-    for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
+    for (const auto src_ic : c10::irange(src_filter[Layout::Filter::input])) {
       const int64_t dst_ic4 = src_ic / 4;
 
       for (const auto src_ih : c10::irange(src_kh_sz)) {
diff --git a/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp b/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp
index 8896f6f..d459e5d 100644
--- a/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp
+++ b/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp
@@ -3,6 +3,7 @@
 #include <ATen/native/utils/ParamUtils.h>
 #include <ATen/native/vulkan/ops/Common.h>
 #include <ATen/native/vulkan/api/Utils.h>
+#include <c10/util/irange.h>
 
 namespace at {
 namespace native {
@@ -51,7 +52,7 @@
   float* const dst_weight_ptr = v_weight_payload.get();
   memset(dst_weight_ptr, 0, v_weight.nbytes());
 
-  for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) {
+  for (const auto src_oc : c10::irange(src_filter[Layout::Filter::output])) {
     /* Source */
     const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz;
 
@@ -61,10 +62,10 @@
 
     float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz;
 
-    for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) {
-      for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) {
+    for (const auto src_ic : c10::irange(src_filter[Layout::Filter::input])) {
+      for (const auto src_ih : c10::irange(src_kh_sz)) {
         const int64_t dst_h = reversed ? (src_kh_sz - 1 - src_ih) : src_ih;
-        for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) {
+        for (const auto src_iw : c10::irange(src_kw_sz)) {
           const int64_t dst_w = reversed ? (src_kw_sz - 1 - src_iw) : src_iw;
           const int64_t dst_w_offset = dst_w * stack_depth;
           memcpy(
@@ -127,7 +128,7 @@
     float* const dst_bias_ptr = v_bias_payload.get();
 
     memset(dst_bias_ptr, 0, v_bias.nbytes());
-    for (int64_t i = 0; i < src_w; ++i) {
+    for (const auto i : c10::irange(src_w)) {
       const int64_t c = i % 4;
       const int64_t x = i / 4;
       dst_bias_ptr[c * packed_w + x] = src_bias_ptr[i];
@@ -251,7 +252,7 @@
   std::vector<int64_t> output_size(dim);
   output_size[0] = input_size[input_batch_size_dim];
   output_size[1] = weight_size[weight_input_channels_dim];
-  for (size_t d = 2; d < dim; ++d) {
+  for (const auto d : c10::irange(2, dim)) {
     output_size[d] = stride[d - 2] * (input_size[d] - 1) + weight_size[d] - 2 * padding[d - 2] + output_padding[d - 2];
   }
   return output_size;
diff --git a/aten/src/ATen/nnapi/nnapi_bind.cpp b/aten/src/ATen/nnapi/nnapi_bind.cpp
index 54267f4..4d48938 100644
--- a/aten/src/ATen/nnapi/nnapi_bind.cpp
+++ b/aten/src/ATen/nnapi/nnapi_bind.cpp
@@ -150,7 +150,7 @@
   TORCH_CHECK(operand->dimensionCount == t.dim()); // Check for overflow.
   dims->resize(t.dim());
   operand->dimensions = dims->data();
-  for (size_t i = 0; i < dims->size(); i++) {
+  for (const auto i : c10::irange(dims->size())) {
     (*dims)[i] = t.sizes()[i];
     TORCH_CHECK((*dims)[i] == t.sizes()[i]); // Check for overflow.
   }
diff --git a/aten/src/ATen/nnapi/nnapi_model_loader.cpp b/aten/src/ATen/nnapi/nnapi_model_loader.cpp
index 2906e62..8553d97 100644
--- a/aten/src/ATen/nnapi/nnapi_model_loader.cpp
+++ b/aten/src/ATen/nnapi/nnapi_model_loader.cpp
@@ -4,6 +4,7 @@
 #include <ATen/nnapi/NeuralNetworks.h>
 #include <ATen/nnapi/nnapi_wrapper.h>
 #include <ATen/nnapi/nnapi_model_loader.h>
+#include <c10/util/irange.h>
 
 
 #ifndef NNAPI_LOADER_STANDALONE
@@ -138,15 +139,15 @@
   next_pointer = (uint8_t*)serialized_model + required_size;
   CAFFE_ENFORCE(next_pointer <= end_of_buf);
 
-  for (int i = 0; i < ser_model->operand_count; i++) {
+  for (const auto i : c10::irange(ser_model->operand_count)) {
     required_size += 4 * operands[i].dimension_count;
   }
 
-  for (int i = 0; i < ser_model->value_count; i++) {
+  for (const auto i : c10::irange(ser_model->value_count)) {
     required_size += value_physical_size(values[i].source_length);
   }
 
-  for (int i = 0; i < ser_model->operation_count; i++) {
+  for (const auto i : c10::irange(ser_model->operation_count)) {
     required_size += 4 * (operations[i].input_count + operations[i].output_count);
   }
 
@@ -155,7 +156,7 @@
   CAFFE_ENFORCE(model_length >= required_size, "Model is too small.  Size = ", model_length);
   CAFFE_ENFORCE(next_pointer <= end_of_buf);
 
-  for (int i = 0; i < ser_model->operand_count; i++) {
+  for (const auto i : c10::irange(ser_model->operand_count)) {
     ANeuralNetworksOperandType operand;
     operand.type = operands[i].type;
     operand.scale = operands[i].scale;
@@ -171,7 +172,7 @@
     NNAPI_CHECK(result);
   }
 
-  for (int i = 0; i < ser_model->value_count; i++) {
+  for (const auto i : c10::irange(ser_model->value_count)) {
     uint32_t len = values[i].source_length;
     const uint8_t* stored_pointer = next_pointer;
     // NOLINTNEXTLINE(modernize-use-nullptr)
@@ -220,7 +221,7 @@
     NNAPI_CHECK(result);
   }
 
-  for (int i = 0; i < ser_model->operation_count; i++) {
+  for (const auto i : c10::irange(ser_model->operation_count)) {
     const uint32_t* inputs = (const uint32_t*)next_pointer;
     next_pointer += 4 * operations[i].input_count;
     CAFFE_ENFORCE(next_pointer <= end_of_buf);
diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp
index 6c2c977..d78ef45 100644
--- a/aten/src/ATen/test/basic.cpp
+++ b/aten/src/ATen/test/basic.cpp
@@ -254,7 +254,7 @@
   for (const auto i : c10::irange(tensor.numel())) {
     ASSERT_TRUE(tensor[i].equal(one * i));
   }
-  for (size_t i = 0; i < static_cast<uint64_t>(tensor.numel()); ++i) {
+  for (const auto i : c10::irange(static_cast<uint64_t>(tensor.numel()))) {
     ASSERT_TRUE(tensor[i].equal(one * static_cast<int64_t>(i)));
   }
   for (const auto i : c10::irange(tensor.numel())) {
diff --git a/aten/src/ATen/test/packedtensoraccessor_test.cpp b/aten/src/ATen/test/packedtensoraccessor_test.cpp
index 2a56fb4..e79a6c5 100644
--- a/aten/src/ATen/test/packedtensoraccessor_test.cpp
+++ b/aten/src/ATen/test/packedtensoraccessor_test.cpp
@@ -22,9 +22,9 @@
   ASSERT_EQ(original.size(0), transposed.size(2));
   ASSERT_EQ(original.size(1), transposed.size(1));
   ASSERT_EQ(original.size(2), transposed.size(0));
-  for (int i = 0; i < sizes[0]; i++) {
-    for (int j = 0; j < sizes[1]; j++) {
-      for (int k = 0; k < sizes[2]; k++) {
+  for (const auto i : c10::irange(sizes[0])) {
+    for (const auto j : c10::irange(sizes[1])) {
+      for (const auto k : c10::irange(sizes[2])) {
         ASSERT_EQ(original[i][j][k], transposed[k][j][i]);
       }
     }
diff --git a/aten/src/ATen/test/quantized_test.cpp b/aten/src/ATen/test/quantized_test.cpp
index 3f56e38..545ce03 100644
--- a/aten/src/ATen/test/quantized_test.cpp
+++ b/aten/src/ATen/test/quantized_test.cpp
@@ -288,7 +288,7 @@
   const int numel = 132;
 
   std::vector<float> x_values;
-  for (int i = 0; i < numel; i++) {
+  for (const auto i : c10::irange(numel)) {
     x_values.push_back(9 * i);
   }
 
@@ -303,14 +303,14 @@
     for (int zero_point : {zero_point_min, 10, zero_point_max}) {
       const Tensor q = at::quantize_per_tensor(x, scale, zero_point, scalar_type);
       auto* q_data = get_data_ptr(q);
-      for (int i = 0; i < numel; i++) {
+      for (const auto i : c10::irange(numel)) {
         ASSERT_EQ(
           q_data[i].val_,
           quantize_val_with_datatype(scale, zero_point, x_values[i]).val_);
       }
       const Tensor r = q.dequantize();
       const float* r_data = r.data_ptr<float>();
-      for (int i = 0; i < numel; i++) {
+      for (const auto i : c10::irange(numel)) {
         ASSERT_FLOAT_EQ(
           r_data[i],
           native::dequantize_val(scale, zero_point, q_data[i]));
diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp
index 6cd9f0d..35cb7f1 100644
--- a/aten/src/ATen/test/vec_test_all_types.cpp
+++ b/aten/src/ATen/test/vec_test_all_types.cpp
@@ -457,7 +457,7 @@
         CACHE_ALIGN VT expected_vals[vec::size()];
         auto vals = 1 << (vec::size());
         for (const auto val : c10::irange(vals)) {
-          for (int i = 0; i < vec::size(); ++i) {
+          for (const auto i : c10::irange(vec::size())) {
             if (val & (1 << i)) {
               test_vals[i] = std::numeric_limits<VT>::quiet_NaN();
               // All bits are set to 1 if true, otherwise 0.
@@ -750,8 +750,7 @@
         auto power_sets = 1 << (vec::size());
         for (const auto expected : c10::irange(power_sets)) {
             // generate test_val based on expected
-            for (int i = 0; i < vec::size(); ++i)
-            {
+            for (const auto i : c10::irange(vec::size())) {
                 if (expected & (1 << i)) {
                     test_vals[i] = (VT)0;
                 }
@@ -777,7 +776,7 @@
         CACHE_ALIGN IntVT expected_vals1[vec::size()];
         // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
         CACHE_ALIGN IntVT actual_vals1[vec::size()];
-        for (int64_t i = 0; i < vec::size(); i++) {
+        for (const auto i : c10::irange(vec::size())) {
             input1[i] = (VT)i * (VT)2.1 + (VT)0.5;
             expected_vals1[i] = static_cast<IntVT>(input1[i]);
         }
@@ -795,7 +794,7 @@
         CACHE_ALIGN VT expected_vals2[vec::size()];
         // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
         CACHE_ALIGN VT actual_vals2[vec::size()];
-        for (int64_t i = 0; i < vec::size(); i++) {
+        for (const auto i : c10::irange(vec::size())) {
             input2[i] = (IntVT)i * (IntVT)2 + (IntVT)1;
             expected_vals2[i] = (VT)input2[i];
         }
@@ -834,7 +833,7 @@
     test_blend(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()]) {
         // generate expected_val
         int64_t m = mask;
-        for (int64_t i = 0; i < vec::size(); i++) {
+        for (const auto i : c10::irange(vec::size())) {
             expected_val[i] = (m & 0x01) ? b[i] : a[i];
             m = m >> 1;
         }
@@ -853,7 +852,7 @@
     test_blendv(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()], VT mask[vec::size()]) {
         using bit_rep = BitType<VT>;
         // generate expected_val
-        for (int64_t i = 0; i < vec::size(); i++) {
+        for (const auto i : c10::irange(vec::size())) {
             bit_rep hex_mask = 0;
             hex_mask=bit_cast<bit_rep>(mask[i]);
             expected_val[i] = (hex_mask & 0x01) ? b[i] : a[i];
@@ -865,7 +864,7 @@
         auto expected = vec::loadu(expected_val);
         auto actual = vec::blendv(vec_a, vec_b, vec_m);
         auto mask_str = std::string("\nblendv mask: ");
-        for (int64_t i = 0; i < vec::size(); i++) {
+        for (const auto i : c10::irange(vec::size())) {
             mask_str += std::to_string(mask[i]) + " ";
         }
         if (AssertVectorized<vec>(std::string(NAME_INFO(test_blendv)) + mask_str, expected, actual).check()) {
@@ -953,7 +952,7 @@
     void test_set(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()], int64_t count){
         if (count < 0) return;
         //generate expected_val
-        for (int64_t i = 0; i < vec::size(); i++) {
+        for (const auto i : c10::irange(vec::size())) {
             expected_val[i] = (i < count) ? b[i] : a[i];
         }
         // test with set
@@ -1000,7 +999,7 @@
         CACHE_ALIGN VT expected_val[vec::size()];
         VT base, step;
         arange_init(base, step);
-        for (int64_t i = 0; i < vec::size(); i++) {
+        for (const auto i : c10::irange(vec::size())) {
             expected_val[i] = base + VT((UVT)i) * step;
         }
         auto expected = vec::loadu(expected_val);
@@ -1059,7 +1058,7 @@
             float inv_scale = 1.0f / static_cast<float>(scale);
             auto zero_point_val = generator_zp.get();
             int index = 0;
-            for (int j = 0; j < vec::float_num_vecs(); j++) {
+            for (const auto j : c10::irange(vec::float_num_vecs())) {
                 //generate vals
                 for (auto& v : unit_float_vec) {
                     v = gen.get();
@@ -1107,7 +1106,7 @@
             int index = 0;
             auto qint_vec = vec::loadu(qint_vals);
             auto actual_float_ret = qint_vec.dequantize(vf_scale, vf_zp, vf_scale_zp);
-            for (int j = 0; j < vec::float_num_vecs(); j++) {
+            for (const auto j : c10::irange(vec::float_num_vecs())) {
                 for (auto& v : unit_exp_vals) {
                     v = dequantize_val(scale, zero_point_val, qint_vals[index]);
                     index++;
@@ -1144,7 +1143,7 @@
             float multiplier = 1.f / (generator_sc.get());
             auto zero_point_val = generator.get();
             int index = 0;
-            for (int j = 0; j < vec::float_num_vecs(); j++) {
+            for (const auto j : c10::irange(vec::float_num_vecs())) {
                 //generate vals
                 for (auto& v : unit_int_vec) {
                     v = c10::qint32(generator.get());
@@ -1180,7 +1179,7 @@
         for (const auto i : c10::irange(trials)) {
             (void)i; // Suppress unused variable warning
             //generate vals
-            for (int j = 0; j < vec::size(); j++) {
+            for (const auto j : c10::irange(vec::size())) {
                 qint_vals[j] = generator.get();
                 qint_b[j] = generator.get();
                 if (std::is_same<underlying, int>::value) {
@@ -1192,7 +1191,7 @@
             auto qint_vec = vec::loadu(qint_vals);
             auto qint_vec_b = vec::loadu(qint_b);
             auto actual_int_ret = qint_vec.widening_subtract(qint_vec_b);
-            for (int j = 0; j < vec::float_num_vecs(); j++) {
+            for (const auto j : c10::irange(vec::float_num_vecs())) {
                 for (auto& v : unit_exp_vals) {
                     v = widening_subtract(qint_vals[index], qint_b[index]);
                     index++;
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index 7f1ea01..ec2b1bd 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -2,6 +2,7 @@
 
 #include <gtest/gtest.h>
 #include <ATen/ATen.h>
+#include <c10/util/irange.h>
 
 // TODO: These functions should move to a common place.
 
@@ -46,9 +47,9 @@
   const float maxDiff = maxValue * tolerance;
   std::cout << "Max Diff allowed: " << maxDiff << std::endl;
   if (diff.sizes().size() == 2) {
-    for (int y = 0; y < diff.sizes()[0]; y++) {
+    for (const auto y : c10::irange(diff.sizes()[0])) {
       std::cout << y << ":";
-      for (int x = 0; x < diff.sizes()[1]; x++) {
+      for (const auto x : c10::irange(diff.sizes()[1])) {
         float diff_xy = diff[y][x].item<float>();
         if (diff_xy > maxDiff) {
           std::cout << std::setw(5) << x;
@@ -69,7 +70,7 @@
     out.push_back(in);
   }
   else {
-    for (int j = i; j < in.size(); ++j) {
+    for (const auto j : c10::irange(i, in.size())) {
       std::swap(in[i], in[j]);
       gen_allpermutations(out, in, i + 1);
     }