use irange for loops 8 (#66743)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/66743

Modified loops in files under fbsource/fbcode/caffe2/ from the format

`for(TYPE var=x0;var<x_max;var++)`

to the format

`for(const auto var: irange(x_max))` (or `irange(x0, x_max)` when the loop starts at a non-zero `x0`)

This was achieved by running r-barnes's loop upgrader script (D28874212), modified to exclude all files under /torch/jit, followed by a number of reversions and unused-variable warning suppressions added by hand.

Test Plan: Sandcastle

Reviewed By: malfet

Differential Revision: D31705359

fbshipit-source-id: c9ea2fbc0f9cd29e97a52dcb203addc5f2abb09b
diff --git a/caffe2/operators/reshape_op.h b/caffe2/operators/reshape_op.h
index 57b5174..765921e 100644
--- a/caffe2/operators/reshape_op.h
+++ b/caffe2/operators/reshape_op.h
@@ -97,7 +97,7 @@
     }
 
     int unknown_idx = -1;
-    for (int i = 0; i < actual_new_shape.size(); ++i) {
+    for (const auto i : c10::irange(actual_new_shape.size())) {
       const auto dim = actual_new_shape[i];
       if (dim == -1) {
         CAFFE_ENFORCE(
@@ -153,7 +153,7 @@
     old_shape->Resize(input.sizes().size());
     T* old_shape_data = old_shape->template mutable_data<T>();
     std::vector<T> old_shape_vector(input.sizes().begin(), input.sizes().end());
-    for (int i = 0; i < old_shape_vector.size(); ++i) {
+    for (const auto i : c10::irange(old_shape_vector.size())) {
       old_shape_data[i] = old_shape_vector[i];
     }
 
diff --git a/caffe2/operators/reverse_packed_segs_op.h b/caffe2/operators/reverse_packed_segs_op.h
index ac6dfdf..1d92e81 100644
--- a/caffe2/operators/reverse_packed_segs_op.h
+++ b/caffe2/operators/reverse_packed_segs_op.h
@@ -62,7 +62,7 @@
     context_.FinishDeviceComputation();
 
     T* rev_data_ptr = output->template mutable_data<T>();
-    for (int64_t i = 0; i < batch_size; i++) {
+    for (const auto i : c10::irange(batch_size)) {
       const auto& seg_length = lengths_host[i];
       CAFFE_ENFORCE_LE(seg_length, max_length);
       int64_t j = 0;
diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
index ebcf50e..13f542c 100644
--- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
+++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h
@@ -32,7 +32,7 @@
     std::vector<std::string> blob_names_vector = {};
 
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (int64_t i = 0; i < stepWorkspaces.size(); i++) {
+    for (const auto i : c10::irange(stepWorkspaces.size())) {
       Workspace* currentStepWorkspace = stepWorkspaces[i].get();
       std::vector<std::string> blob_names = currentStepWorkspace->LocalBlobs();
 
diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h
index eecccf7..179bb7c 100644
--- a/caffe2/operators/rnn/recurrent_network_executor.h
+++ b/caffe2/operators/rnn/recurrent_network_executor.h
@@ -38,7 +38,7 @@
         recurrent_input_map_(recurrent_input_map),
         timestep_blob_(timestep_blob) {
     const bool net_def_has_device_option = step_net_def_.has_device_option();
-    for (int i = 0; i < step_net_def_.op_size(); i++) {
+    for (const auto i : c10::irange(step_net_def_.op_size())) {
       if (net_def_has_device_option) {
         // In the case when net def specifies device option, final device option
         // will be equal to merge of operator and net def device options, with
@@ -86,7 +86,7 @@
       for (auto& rnn_op : timestep_ops_template_) {
         rnn_op.has_timestep_blob = false;
         const OperatorDef& op = step_net_def_.op(rnn_op.order);
-        for (int i = 0; i < op.input_size(); i++) {
+        for (const auto i : c10::irange(op.input_size())) {
           if (op.input(i) == timestep_blob_) {
             rnn_op.has_timestep_blob = true;
             break;
@@ -137,7 +137,7 @@
         if (rnn_op.has_timestep_blob) {
           OperatorDef op_copy = step_net_def_.op(rnn_op.order);
 
-          for (int i = 0; i < op_copy.input_size(); i++) {
+          for (const auto i : c10::irange(op_copy.input_size())) {
             if (op_copy.input(i) == timestep_blob_) {
               op_copy.set_input(i, this_timestep_blob);
             }
@@ -283,7 +283,7 @@
       int opidx,
       std::vector<RNNNetOperator>& rnn_ops,
       std::unordered_set<int>* dep_ops) {
-    for (int i = 0; i < rnn_ops.size(); i++) {
+    for (const auto i : c10::irange(rnn_ops.size())) {
       if (i == opidx) {
         continue;
       }
@@ -315,7 +315,7 @@
    * for each timestep.
    */
   void CalculateInternalDependencies() {
-    for (int i = 0; i < step_net_def_.op_size(); i++) {
+    for (const auto i : c10::irange(step_net_def_.op_size())) {
       timestep_ops_template_.push_back(RNNNetOperator(step_net_def_.op(i), i));
     }
     // Then see which outputs appear as inputs, and those are
diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h
index de82c9f..803606d 100644
--- a/caffe2/operators/rnn/recurrent_network_op.h
+++ b/caffe2/operators/rnn/recurrent_network_op.h
@@ -103,7 +103,7 @@
     T* dst,
     Context* context) {
   // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-  for (int i = 0; i < repeat_n; ++i) {
+  for (const auto i : c10::irange(repeat_n)) {
     context->template CopySameDevice<T>(n, src, dst + i * n);
   }
 }
@@ -228,7 +228,7 @@
     CAFFE_ENFORCE_EQ(states.size(), inputs.size(), "states/inputs mismatch");
     std::vector<detail::RecurrentInput> ris;
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (auto i = 0; i < states.size(); ++i) {
+    for (const auto i : c10::irange(states.size())) {
       // States need to be "global" (since they are shared between
       // forward and backward).
       sharedWs->CreateBlob(states[i]);
@@ -254,7 +254,7 @@
         dst.size() == offset.size(), "alias_dst/alias_offset mismatch");
     std::vector<detail::OffsetAlias> aliases;
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (auto i = 0; i < src.size(); ++i) {
+    for (const auto i : c10::irange(src.size())) {
       detail::OffsetAlias oc;
       oc.src = src[i];
       oc.dst = dst[i];
@@ -343,7 +343,7 @@
       stepWorkspaces.resize(num_workspaces_on_fwd_only);
     }
 
-    for (auto t = 0; t < seqLen; ++t) {
+    for (const auto t : c10::irange(seqLen)) {
       auto& currentStepWorkspace =
           (has_backward_pass ? stepWorkspaces[t] :
               stepWorkspaces[t % num_workspaces_on_fwd_only]);
@@ -472,7 +472,7 @@
   }
 
   void renameOpInputOutput(std::string from_name, std::string to_name) {
-    for (int j = 0; j < stepNetDef_.op_size(); j++) {
+    for (const auto j : c10::irange(stepNetDef_.op_size())) {
       auto* op = stepNetDef_.mutable_op(j);
       for (int i = 0; i < op->input_size(); i++) {
         if (op->input(i) == from_name) {
@@ -498,7 +498,7 @@
         " != ",
         param_grads.size());
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (int i = 0; i < param.size(); ++i) {
+    for (const auto i : c10::irange(param.size())) {
       detail::Param p;
       // Forward inputs come after [outputs_with_grads] gradient inputs
       p.param = operator_def.input(param[i] + gradInputs_.size());
@@ -526,17 +526,17 @@
         this->template GetRepeatedArgument<int32_t>("alias_offset");
 
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (auto i = 0; i < recurrent.size(); ++i) {
+    for (const auto i : c10::irange(recurrent.size())) {
       detail::RecurrentGradient rg;
       rg.param = recurrent[i];
       rg.grad = remappedName(recurrent[i] + "_grad");
 
-      for (int j = 0; j < alias_src.size(); ++j) {
+      for (const auto j : c10::irange(alias_src.size())) {
         if (alias_src[j] != recurrent[i]) {
           continue;
         }
         int idx = -1;
-        for (int k = 0; k < gradInputs_.size(); ++k) {
+        for (const auto k : c10::irange(gradInputs_.size())) {
           if (gradInputs_[k] == j) {
             idx = k;
           }
@@ -575,7 +575,7 @@
         "",
         &links);
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (int i = 0; i < links.size(); i++) {
+    for (const auto i : c10::irange(links.size())) {
       links[i] = remappedLink(links[i]);
     }
     return links;
@@ -715,7 +715,7 @@
     // This code assumes that there are several inputs
     // sequences. Actually it is not supported by the rest of the code,
     // and numSequences_ is a constant, equal to 1.
-    for (int i = 0; i < numSequences_; ++i) {
+    for (const auto i : c10::irange(numSequences_)) {
       // Offseting as the first gradInputs_.size() inputs of the op
       // are from GO. Then all I(0..N).
       const int gradientInputIndex = i + gradInputs_.size();
@@ -790,7 +790,7 @@
 
     CAFFE_ENFORCE_EQ(recurrentInputIds_.size(), recurrentGradients_.size());
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (int i = 0; i < recurrentInputIds_.size(); ++i) {
+    for (const auto i : c10::irange(recurrentInputIds_.size())) {
       // See GetRecurrentNetworkGradient to understand offseting here
       // Outputs of the gradient are inputs of the forward pass.
       // So we need to offset on all inputs that go before recurrent
diff --git a/caffe2/operators/rowmul_op.h b/caffe2/operators/rowmul_op.h
index ef86627..cf0339e 100644
--- a/caffe2/operators/rowmul_op.h
+++ b/caffe2/operators/rowmul_op.h
@@ -32,9 +32,9 @@
         "Length of w should be equal to the first dim of mat");
 
     auto block_size = mat.size_from_dim(1);
-    for (int i = 0; i < w.numel(); i++) {
+    for (const auto i : c10::irange(w.numel())) {
       size_t offset = i * block_size;
-      for (int j = 0; j < block_size; j++) {
+      for (const auto j : c10::irange(block_size)) {
         output_data[offset + j] = mat_data[offset + j] * w_data[i];
       }
     }
@@ -60,10 +60,10 @@
     T* output_data = output->template mutable_data<T>();
     const T* mat_data = mat.template data<T>();
 
-    for (int i = 0; i < N; i++) {
+    for (const auto i : c10::irange(N)) {
       output_data[i] = 0;
       size_t offset = i * block_size;
-      for (int j = 0; j < block_size; j++) {
+      for (const auto j : c10::irange(block_size)) {
         output_data[i] += mat_data[offset + j];
       }
     }
diff --git a/caffe2/operators/scale_blobs_op.h b/caffe2/operators/scale_blobs_op.h
index dce69e3..af7d800 100644
--- a/caffe2/operators/scale_blobs_op.h
+++ b/caffe2/operators/scale_blobs_op.h
@@ -20,7 +20,7 @@
   bool DoRunWithType() {
     int batchSize = InputSize();
 
-    for (int i = 0; i < batchSize; ++i) {
+    for (const auto i : c10::irange(batchSize)) {
       const auto& X = Input(i);
       auto* Y = Output(i, X.sizes(), at::dtype<T>());
       math::Scale<float, T, Context>(
@@ -34,7 +34,7 @@
   }
 
   bool RunOnDevice() override {
-    for (int i = 0; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(InputSize())) {
       auto& input = this->template Input<Tensor>(i, CPU);
       auto* output = this->template Output<Tensor>(i, CPU);
       output->ResizeLike(input);
diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h
index f126b10..b407581 100644
--- a/caffe2/operators/segment_reduction_op.h
+++ b/caffe2/operators/segment_reduction_op.h
@@ -2,6 +2,7 @@
 #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
 
 #include "caffe2/core/export_caffe2_op_to_c10.h"
+#include <c10/util/irange.h>
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
@@ -335,7 +336,7 @@
     const int num_blocks = block_size > 0 ? data.numel() / block_size : 0;
 
     Reducer r(ctx, out, &context_);
-    for (int64_t i = 0; i < num_blocks; ++i) {
+    for (const auto i : c10::irange(num_blocks)) {
       r.template process<FixedSize>(
           ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_);
     }
@@ -406,7 +407,7 @@
     T* out = data_grads->template mutable_data<T>();
 
     ReducerGradient r(ctx, r_grad, &context_);
-    for (int64_t i = 0; i < block_num; ++i) {
+    for (const auto i : c10::irange(block_num)) {
       r.template fillGrad<FixedSize>(
           ctx,
           out + block_size * i,
@@ -1070,7 +1071,7 @@
       K = num_segments_;
     } else {
       K = 0;
-      for (int64_t i = 0; i < N; ++i) {
+      for (const auto i : c10::irange(N)) {
         K = std::max(K, s_ids[i] + 1);
       }
     }
@@ -1086,11 +1087,11 @@
 
     reducers_.clear();
     reducers_.reserve(K);
-    for (int64_t i = 0; i < K; ++i) {
+    for (const auto i : c10::irange(K)) {
       reducers_.emplace_back(ctx, out + out_block_size * i, &context_);
     }
 
-    for (int64_t i = 0; i < N; ++i) {
+    for (const auto i : c10::irange(N)) {
       auto s_id = s_ids[i];
       CAFFE_ENFORCE(
           0 <= s_id && s_id < K,
@@ -1114,7 +1115,7 @@
           ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
     }
 
-    for (int64_t i = 0; i < K; ++i) {
+    for (const auto i : c10::irange(K)) {
       reducers_[i].template finish<FixedSize>(ctx, &context_);
     }
     // call reducers destructors (if there is any)
@@ -1188,7 +1189,7 @@
 
     if (ReducerGradient::computeLength()) {
       segment_length_.resize(K, 0);
-      for (int i = 0; i < N; ++i) {
+      for (const auto i : c10::irange(N)) {
         auto s_id = s_ids[i];
         CAFFE_ENFORCE(
             0 <= s_id && s_id < K,
@@ -1206,7 +1207,7 @@
       reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_);
     }
 
-    for (int64_t i = 0; i < N; ++i) {
+    for (const auto i : c10::irange(N)) {
       auto s_id = s_ids[i];
       if (ReducerGradient::computeLength()) {
         reducers_[s_id].template fillGrad<FixedSize>(
@@ -1462,7 +1463,7 @@
     TData* out = output->template mutable_data<TData>();
 
     int64_t dataIndex = 0;
-    for (int64_t rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(outputSize)) {
       Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
       for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
            ++dataIndex) {
@@ -1551,7 +1552,7 @@
     CAFFE_ENFORCE(segmentGradsInput.dim() > 0);
     CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0));
     const TLengths* lengths = lengthsInput.template data<TLengths>();
-    for (int64_t i = 0; i < numSegments; ++i) {
+    for (const auto i : c10::irange(numSegments)) {
       reducedDataSize += lengths[i];
     }
 
@@ -1580,7 +1581,7 @@
     T* dataGrads = dataGradsOutput->template mutable_data<T>();
 
     int64_t dataIndex = 0;
-    for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       ReducerGradient reducer(
           ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
       for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
@@ -1690,7 +1691,7 @@
 
     const Tembedding* data = dataInput.template data<Tembedding>();
     int64_t dataIndex = 0;
-    for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       ReducerGradient reducer(
           ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
       for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
@@ -1788,7 +1789,7 @@
     const T* data = dataInput.template data<T>();
 
     int64_t dataIndex = 0;
-    for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       ReducerGradient reducer(
           ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
       for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
diff --git a/caffe2/operators/self_binning_histogram_op.h b/caffe2/operators/self_binning_histogram_op.h
index 9ad95a4..3e2f303 100644
--- a/caffe2/operators/self_binning_histogram_op.h
+++ b/caffe2/operators/self_binning_histogram_op.h
@@ -59,12 +59,12 @@
     T max = 0;
     T min = 0;
     int64_t total_count = 0;
-    for (int input_idx = 0; input_idx < InputSize(); input_idx++) {
+    for (const auto input_idx : c10::irange(InputSize())) {
       const auto& x = Input(input_idx);
       const int64_t N = x.numel();
       total_count += N;
       const auto* x_data = x.template data<T>();
-      for (int64_t data_idx = 0; data_idx < N; data_idx++) {
+      for (const auto data_idx : c10::irange(N)) {
         const T val = this->abs_ ? abs(x_data[data_idx]) :  x_data[data_idx];
         if (!first_seen) {
           max = val;
@@ -91,7 +91,7 @@
       scaled_max = min + (max - min) * RANGE_SCALING;
       T scaled_range = (scaled_max - min);
       // Avoid underflow by calculating advancement through multiplication.
-      for (int i = 0; i < num_edges_; i++) {
+      for (const auto i : c10::irange(num_edges_)) {
         T advancement_ratio = T(i) / num_bins_;
         histogram_values_data[i] = min + advancement_ratio * scaled_range;
       }
@@ -112,7 +112,7 @@
       T log_multiplier_numerator =log(scaled_max) - log(min);
       // Avoid underflow by:
       // - Calculating each advancement separately for each i.
-      for (int i = 0; i < num_edges_; i++) {
+      for (const auto i : c10::irange(num_edges_)) {
         T advancement_ratio = T(i)/num_bins_;
         histogram_values_data[i] = min * exp(log_multiplier_numerator * advancement_ratio);
       }
@@ -127,11 +127,11 @@
       histogram_counts_data[0] = total_count;
     }
     else {
-      for (int input_idx = 0; input_idx < InputSize(); input_idx++) {
+      for (const auto input_idx : c10::irange(InputSize())) {
         const auto& x = Input(input_idx);
         const int64_t N = x.numel();
         const auto* x_data = x.template data<T>();
-        for (int64_t data_idx = 0; data_idx < N; data_idx++) {
+        for (const auto data_idx : c10::irange(N)) {
           const T val = this->abs_ ? abs(x_data[data_idx]) :  x_data[data_idx];
           const auto bisection_it = std::upper_bound(
               histogram_values_data,
@@ -163,7 +163,7 @@
 
   void CheckInputs() {
     const auto& input_zero = Input(0);
-    for (int i = 1; i < InputSize(); i++) {
+    for (const auto i : c10::irange(1, InputSize())) {
       CAFFE_ENFORCE_EQ(
           Input(i).dtype(),
           input_zero.dtype(),
diff --git a/caffe2/operators/shape_op.h b/caffe2/operators/shape_op.h
index fd45cbe..4c3750e 100644
--- a/caffe2/operators/shape_op.h
+++ b/caffe2/operators/shape_op.h
@@ -34,7 +34,7 @@
     auto* output = Output(0, {numAxes}, at::dtype<int64_t>());
     auto src = reinterpret_cast<const char*>(data.sizes().data());
     auto out = reinterpret_cast<char*>(output->template mutable_data<int64_t>());
-    for (int i = 0; i < numAxes; i++) {
+    for (const auto i : c10::irange(numAxes)) {
       auto axis = axes_[i];
       CAFFE_ENFORCE_LT(axis, numDims, "Axis out of range");
       CAFFE_ENFORCE_GE(axis, 0, "Each axis should be non-negative");
diff --git a/caffe2/operators/sinusoid_position_encoding_op.h b/caffe2/operators/sinusoid_position_encoding_op.h
index 9b75d33..6cf308c 100644
--- a/caffe2/operators/sinusoid_position_encoding_op.h
+++ b/caffe2/operators/sinusoid_position_encoding_op.h
@@ -51,7 +51,7 @@
     float max_alpha_pow =
         ((float)embedding_size_ - 1.0f) / (float)embedding_size_;
 
-    for (int i = 0; i < M; ++i) {
+    for (const auto i : c10::irange(M)) {
       float pos = (float)idxs[i * K];
 
       // Compute the embedding for position i, example 0 first
@@ -72,7 +72,7 @@
       row_array = amplitude_ * row_array.sin().eval();
 
       // Copy the embedding to position i in the other examples
-      for (int j = 1; j < K; ++j) {
+      for (const auto j : c10::irange(1, K)) {
         int base = i * K * embedding_size_;
         std::copy(
             &out[base],
diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h
index 9706472..973a8c5 100644
--- a/caffe2/operators/slice_op.h
+++ b/caffe2/operators/slice_op.h
@@ -30,7 +30,7 @@
   std::vector<SIndex> ends_idx(data.dim());
   std::vector<SIndex> dst_sizes(data.dim());
 
-  for (int i = 0; i < data.dim(); ++i) {
+  for (const auto i : c10::irange(data.dim())) {
     if (i >= starts.numel()) {
       starts_idx[i] = 0;
       ends_idx[i] = data.size(i);
@@ -78,7 +78,7 @@
   }
   // for now only supports slicing in 1 dimension
   int dim = -1;
-  for (int i = 0; i < data.dim(); ++i) {
+  for (const auto i : c10::irange(data.dim())) {
     if (starts_idx[i] > 0 || ends_idx[i] < data.size(i)) {
       CAFFE_ENFORCE_EQ(
           dim, -1, "Currently only possible to slice in 1 dimension.");
@@ -131,7 +131,7 @@
 
     char* src_offset_bytes = src_bytes + itemsize * src_offset;
     char* dst_offset_bytes = dst_bytes;
-    for (size_t i = 0; i < num_blocks; ++i) {
+    for (const auto i : c10::irange(num_blocks)) {
       char* local_src_offset_bytes =
           src_offset_bytes + i * src_block_size_bytes;
       char* local_dst_offset_bytes =
@@ -177,7 +177,7 @@
       return true;
     }
 
-    for (size_t i = 0; i < num_blocks; ++i) {
+    for (const auto i : c10::irange(num_blocks)) {
       char* local_src_offset_bytes =
           src_offset_bytes + i * src_block_size_bytes;
       char* local_dst_offset_bytes =
diff --git a/caffe2/operators/space_batch_op.h b/caffe2/operators/space_batch_op.h
index 4c80711..ce77227 100644
--- a/caffe2/operators/space_batch_op.h
+++ b/caffe2/operators/space_batch_op.h
@@ -29,14 +29,14 @@
   const int input_height = input.dim32(2);
   const int input_width = input.dim32(3);
 
-  for (int out_b = 0; out_b < output_batch; ++out_b) {
+  for (const auto out_b : c10::irange(output_batch)) {
     const int in_b = out_b % input_batch;
     const int offset_w = (out_b / input_batch) % block_size;
     const int offset_h = (out_b / input_batch) / block_size;
-    for (int d = 0; d < input_depth; ++d) {
-      for (int out_h = 0; out_h < output_height; ++out_h) {
+    for (const auto d : c10::irange(input_depth)) {
+      for (const auto out_h : c10::irange(output_height)) {
         const int in_h = out_h * block_size + offset_h - pad_t;
-        for (int out_w = 0; out_w < output_width; ++out_w) {
+        for (const auto out_w : c10::irange(output_width)) {
           const int in_w = out_w * block_size + offset_w - pad_l;
           const auto output_offset =
               ((out_b * output_depth + d) * output_height + out_h) *
@@ -80,14 +80,14 @@
   const int input_width = input.dim32(3);
 
   CAFFE_ENFORCE(input_depth == output_depth);
-  for (int in_b = 0; in_b < input_batch; ++in_b) {
+  for (const auto in_b : c10::irange(input_batch)) {
     const int out_b = in_b % output_batch;
     const int offset_w = (in_b / output_batch) % block_size;
     const int offset_h = (in_b / output_batch) / block_size;
-    for (int d = 0; d < input_depth; ++d) {
-      for (int in_h = 0; in_h < input_height; ++in_h) {
+    for (const auto d : c10::irange(input_depth)) {
+      for (const auto in_h : c10::irange(input_height)) {
         const int out_h = in_h * block_size + offset_h - pad_t;
-        for (int in_w = 0; in_w < input_width; ++in_w) {
+        for (const auto in_w : c10::irange(input_width)) {
           const int out_w = in_w * block_size + offset_w - pad_l;
           if (out_h >= 0 && out_w >= 0 && out_h < output_height &&
               out_w < output_width) {
diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h
index 36a7da3..67a63e1 100644
--- a/caffe2/operators/sparse_to_dense_mask_op.h
+++ b/caffe2/operators/sparse_to_dense_mask_op.h
@@ -6,6 +6,7 @@
 #include <vector>
 #include "caffe2/core/context.h"
 #include "caffe2/core/export_caffe2_op_to_c10.h"
+#include <c10/util/irange.h>
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
 #include "caffe2/utils/math.h"
@@ -29,7 +30,7 @@
     auto biggest = *std::max_element(mask.begin(), mask.end());
     dense_.assign(std::min(kMaxDenseSize, biggest + 1), -1);
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (int i = 0; i < mask.size(); i++) {
+    for (const auto i : c10::irange(mask.size())) {
       int64_t id = mask[i];
       CAFFE_ENFORCE_GE(id, 0, "Only positive IDs are allowed.");
       if (id >= kMaxDenseSize) {
@@ -155,7 +156,7 @@
     }
 
     int64_t offset = 0;
-    for (int r = 0; r < rows; r++) {
+    for (const auto r : c10::irange(rows)) {
       bool skippedSparseIndex = false;
       for (int c = 0; c < lengths_vec[r]; c++) {
         const auto sparse_index = sparse_indices_vec[offset + c];
@@ -272,7 +273,7 @@
     // SparseToDenseMask is not injective; gradient_used records
     // if the gradient is used for other input value from the same row
     vector<bool> gradient_used(cols, false);
-    for (int r = 0; r < rows; r++) {
+    for (const auto r : c10::irange(rows)) {
       std::fill(gradient_used.begin(), gradient_used.end(), false);
       for (int c = lengths_vec[r] - 1; c >= 0; c--) {
         int idx = this->getFeatureIdx(sparse_indices_vec[offset + c]);
diff --git a/caffe2/operators/sparse_to_dense_op.h b/caffe2/operators/sparse_to_dense_op.h
index 40498a9..5744540 100644
--- a/caffe2/operators/sparse_to_dense_op.h
+++ b/caffe2/operators/sparse_to_dense_op.h
@@ -89,7 +89,7 @@
     const auto block_nitems = sparse_values.size_from_dim(1);
     const TData* sparse_values_vec = sparse_values.template data<TData>();
 
-    for (int32_t i = 0; i < sparse_indices_len; i++) {
+    for (const auto i : c10::irange(sparse_indices_len)) {
       const TInd idx = sparse_indices_vec[i];
       CAFFE_ENFORCE_GE(idx, 0);
       CAFFE_ENFORCE_LT(idx, output_first_dim);
diff --git a/caffe2/operators/square_root_divide_op.h b/caffe2/operators/square_root_divide_op.h
index 9adaff4..d23c808 100644
--- a/caffe2/operators/square_root_divide_op.h
+++ b/caffe2/operators/square_root_divide_op.h
@@ -41,7 +41,7 @@
     auto* scalePtr = scale.template data<TScale>();
     auto* dataPtr = data.template data<TData>();
     auto* yPtr = Y->template mutable_data<TData>();
-    for (auto i = 0U; i < batchSize; ++i) {
+    for (const auto i : c10::irange(0U, batchSize)) {
       auto scale = scalePtr[i];
       CAFFE_ENFORCE(scale >= 0, scale, " < 0");
       auto multiplier = scale == 0 ? 1.0 : 1 / std::sqrt(scale);
diff --git a/caffe2/operators/string_ops.h b/caffe2/operators/string_ops.h
index 49cc322..2642bd3 100644
--- a/caffe2/operators/string_ops.h
+++ b/caffe2/operators/string_ops.h
@@ -20,7 +20,7 @@
 
   template <typename In, typename Out, typename Context>
   bool operator()(int n, const In* in, Out* out, Context* /*c*/) {
-    for (int i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       out[i] = functor(in[i]);
     }
     return true;
diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h
index d07763c..04cf0ec 100644
--- a/caffe2/operators/tensor_protos_db_input.h
+++ b/caffe2/operators/tensor_protos_db_input.h
@@ -51,7 +51,7 @@
     TensorProtos protos;
     CAFFE_ENFORCE(protos.ParseFromString(value_));
     CAFFE_ENFORCE(protos.protos_size() == OutputSize());
-    for (int i = 0; i < protos.protos_size(); ++i) {
+    for (const auto i : c10::irange(protos.protos_size())) {
       if (protos.protos(i).has_device_detail()) {
         protos.mutable_protos(i)->clear_device_detail();
       }
@@ -62,14 +62,14 @@
       //     CPU));
     }
   } else {
-    for (int item_id = 0; item_id < batch_size_; ++item_id) {
+    for (const auto item_id : c10::irange(batch_size_)) {
       reader.Read(&key_, &value_);
       TensorProtos protos;
       CAFFE_ENFORCE(protos.ParseFromString(value_));
       CAFFE_ENFORCE(protos.protos_size() == OutputSize());
       // Note: shape_inferred_ is ignored, we'll always get dimensions from
       // proto
-      for (int i = 0; i < protos.protos_size(); ++i) {
+      for (const auto i : c10::irange(protos.protos_size())) {
         vector<int64_t> dims(
             protos.protos(i).dims().begin(), protos.protos(i).dims().end());
         dims.insert(dims.begin(), batch_size_);
@@ -94,7 +94,7 @@
 
 template <class Context>
 bool TensorProtosDBInput<Context>::CopyPrefetched() {
-  for (int i = 0; i < OutputSize(); ++i) {
+  for (const auto i : c10::irange(OutputSize())) {
     OperatorBase::template Output<Tensor>(i, Context::GetDeviceType())
         ->CopyFrom(
             prefetched_blobs_[i].template Get<TensorCPU>(), /* async */ true);
diff --git a/caffe2/operators/tile_op.h b/caffe2/operators/tile_op.h
index 360b58a..2a960fc 100644
--- a/caffe2/operators/tile_op.h
+++ b/caffe2/operators/tile_op.h
@@ -113,12 +113,12 @@
   bool DoTile(const int outer_size, const int inner_size, const T* X, T* Y) {
     if (inner_size == 1) {
       EigenArrayMap<T> Y_arr(Y, tiles_, outer_size);
-      for (int i = 0; i < outer_size; ++i) {
+      for (const auto i : c10::irange(outer_size)) {
         Y_arr.col(i) = X[i];
       }
     } else {
       ConstEigenArrayMap<T> X_arr(X, inner_size, outer_size);
-      for (int i = 0; i < outer_size; ++i) {
+      for (const auto i : c10::irange(outer_size)) {
         EigenArrayMap<T>(Y + i * tiles_ * inner_size, inner_size, tiles_)
             .colwise() = X_arr.col(i);
       }
@@ -245,10 +245,10 @@
           dX,
           inner_size,
           &context_);
-      for (int i = 0; i < outer_size; ++i) {
+      for (const auto i : c10::irange(outer_size)) {
         const T* dY_ptr = dY + i * tiles_ * inner_size;
         T* dX_ptr = dX + i * inner_size;
-        for (int j = 1; j < tiles_; ++j) {
+        for (const auto j : c10::irange(1, tiles_)) {
           math::Add<T, Context>(
               inner_size, dX_ptr, dY_ptr + j * inner_size, dX_ptr, &context_);
         }
diff --git a/caffe2/operators/transpose_op.h b/caffe2/operators/transpose_op.h
index cfd2e63..c17d0d0 100644
--- a/caffe2/operators/transpose_op.h
+++ b/caffe2/operators/transpose_op.h
@@ -49,7 +49,7 @@
     }
     const at::IntArrayRef X_dims = X.sizes();
     std::vector<std::int64_t> Y_dims(ndim);
-    for (int i = 0; i < ndim; ++i) {
+    for (const auto i : c10::irange(ndim)) {
       Y_dims[i] = X_dims[axes_[i]];
     }
     Y->Resize(Y_dims);
diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h
index d4dead2..6f452d5 100644
--- a/caffe2/operators/tt_linear_op.h
+++ b/caffe2/operators/tt_linear_op.h
@@ -127,7 +127,7 @@
     // Check that output size of Y is the element-wise product of out_sizes
     int prod_out_sizes = 1;
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (int i = 0; i < out_sizes_.size(); i++) {
+    for (const auto i : c10::irange(out_sizes_.size())) {
       prod_out_sizes *= out_sizes_[i];
     }
     CAFFE_ENFORCE(
diff --git a/caffe2/operators/unsafe_coalesce.h b/caffe2/operators/unsafe_coalesce.h
index bb0f58a..4070312 100644
--- a/caffe2/operators/unsafe_coalesce.h
+++ b/caffe2/operators/unsafe_coalesce.h
@@ -3,6 +3,7 @@
 
 #include "caffe2/core/context.h"
 #include "caffe2/core/export_caffe2_op_to_c10.h"
+#include <c10/util/irange.h>
 #include "caffe2/core/operator.h"
 
 
@@ -16,7 +17,7 @@
 
   bool RunOnDevice() override {
     size_t coalesced_size = 0;
-    for (int i = 0; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(InputSize())) {
       // For now only float type is supported
       CAFFE_ENFORCE(
           Input(i).dtype().template Match<float>(),
@@ -24,14 +25,14 @@
           i);
     }
 
-    for (int i = 0; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(InputSize())) {
       coalesced_size += Input(i).numel();
     }
     auto* coalesced = Output(OutputSize() - 1, coalesced_size, at::dtype<float>());
     auto coalesced_data = coalesced->template mutable_data<float>();
 
     size_t coalesced_offset = 0;
-    for (auto i = 0; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(InputSize())) {
       const auto num_elems = Input(i).numel();
       auto input_sizes = Input(i).sizes().vec();
       // Don't do anything if both tensors are already pointing on the same data
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index e766493..8e8a72a 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -8,6 +8,7 @@
 #include "caffe2/core/common_omp.h"
 #include "caffe2/core/context.h"
 #include "caffe2/core/export_caffe2_op_to_c10.h"
+#include <c10/util/irange.h>
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/types.h"
@@ -64,7 +65,7 @@
     const auto* X_data = X.template data<T>();
     uint8_t* Y_data = Y->template mutable_data<uint8_t>();
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (size_t i = 0; i < X.numel(); i++) {
+    for (const auto i : c10::irange(X.numel())) {
       Y_data[i] = (uint8_t)(std::isnan(X_data[i]));
     }
     return true;
@@ -299,7 +300,7 @@
     auto* output = Output(0, input0.sizes(), at::dtype<T>());
     T* output_data = output->template mutable_data<T>();
     // Dimension checking
-    for (int i = 1; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(1, InputSize())) {
       if (output->sizes() != Input(i).sizes()) {
         CAFFE_THROW(
             "Check failed: output->sizes() == Input(i).sizes().",
@@ -320,7 +321,7 @@
         output_data,
         &context_);
     // Add remaining.
-    for (int i = 2; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(2, InputSize())) {
       math::Add(
           output->numel(),
           output_data,
@@ -577,7 +578,7 @@
     float w0 = *weight0.template data<float>();
     // It's most likely a constant so exact comparison is fine
     if (w0 != 1.0) {
-      for (int i = 0; i < K; ++i) {
+      for (const auto i : c10::irange(K)) {
         Index idx = idxs[i];
         CAFFE_ENFORCE(
             0 <= idx && idx < N,
@@ -600,7 +601,7 @@
       CAFFE_ENFORCE_EQ(weight.numel(), 1);
       const T* x_data = X.template data<T>();
       float w = *weight.template data<float>();
-      for (int i = 0; i < K; ++i) {
+      for (const auto i : c10::irange(K)) {
         Index idx = idxs[i];
         // double-checking the indices, but it's fine as it's DCHECK only
         DCHECK(0 <= idx && idx < N)
@@ -746,7 +747,7 @@
       int64_t N,
       int64_t K,
       int64_t block_size) {
-    for (int i = 0; i < K; ++i) {
+    for (const auto i : c10::irange(K)) {
       Index idx = idxs[i];
       // double-checking the indices, but it's fine as it's DCHECK only
       DCHECK(0 <= idx && idx < N)
@@ -838,11 +839,9 @@
     // dst should have the same rank as idxs and src, but the dimension of dim
     // axis can be different. That is why in the above equation, there is the
     // difference of J_src and J_dst.
-    for (int64_t outer_batch = 0; outer_batch < outer_dims_product;
-         ++outer_batch) {
-      for (int64_t i = 0; i < N; ++i) {
-        for (int64_t inner_batch = 0; inner_batch < idxs_block_size;
-             ++inner_batch) {
+    for (const auto outer_batch : c10::irange(outer_dims_product)) {
+      for (const auto i : c10::irange(N)) {
+        for (const auto inner_batch : c10::irange(idxs_block_size)) {
           auto idxs_elem_idx =
               outer_batch * idxs_batch_size + i * idxs_block_size + inner_batch;
           auto src_elem_idx =
@@ -867,7 +866,7 @@
       const IndexType* indices,
       int64_t n,
       IndexType indexing_axis_dim) {
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       CAFFE_ENFORCE(
           0 <= idx && idx < indexing_axis_dim,
@@ -900,7 +899,7 @@
     output->Resize(total_length);
     auto* output_data = output->template mutable_data<int32_t>();
 
-    for (int i = 0; i < input.numel(); ++i) {
+    for (const auto i : c10::irange(input.numel())) {
       auto len = input_data[i];
       std::fill(output_data, output_data + len, i);
       output_data += len;
@@ -927,7 +926,7 @@
     auto* output_data = output->template mutable_data<int32_t>();
 
     int32_t offset = 0;
-    for (int i = 0; i < size; ++i) {
+    for (const auto i : c10::irange(size)) {
       auto len = input_data[i];
       output_data[i * 2] = offset;
       output_data[i * 2 + 1] = len;
@@ -961,7 +960,7 @@
     auto* output_data = output->template mutable_data<int32_t>();
 
     int32_t offset = 0;
-    for (int i = 0; i < size; ++i) {
+    for (const auto i : c10::irange(size)) {
       auto len = input_data[i];
       output_data[i] = offset;
       offset += len;
@@ -1018,7 +1017,7 @@
     }
     std::fill(output_data, output_data + num_segments, 0);
     Index prev = 0; // Assume that segment_id >= 0.
-    for (int64_t i = 0; i < input_size; i++) {
+    for (const auto i : c10::irange(input_size)) {
       CAFFE_ENFORCE(
           prev <= input_data[i],
           "Segment ids must be sorted: ",
@@ -1069,7 +1068,7 @@
     }
     std::fill(output_data, output_data + num_segments * 2, 0);
     Index prev = input_data[0];
-    for (int64_t i = 0; i < input_size; i++) {
+    for (const auto i : c10::irange(input_size)) {
       CAFFE_ENFORCE(
           prev <= input_data[i],
           "Segment ids must be sorted: ",
@@ -1109,7 +1108,7 @@
     auto* output = Output(0);
 
     int64_t output_size = 0;
-    for (auto i = 0; i < input_size; i++) {
+    for (const auto i : c10::irange(input_size)) {
       CAFFE_ENFORCE_GE(input_data[i], 0, "unexpected negative length value");
       output_size += input_data[i];
     }
@@ -1132,7 +1131,7 @@
     output->Resize(output_size);
     auto* output_data = output->template mutable_data<float>();
     int64_t cnt = 0;
-    for (auto i = 0; i < input_size; i++) {
+    for (const auto i : c10::irange(input_size)) {
       auto len = input_data[i];
       if (len == 0) {
         continue;
@@ -1159,7 +1158,7 @@
 
   bool RunOnDevice() override {
     bool res = false;
-    for (auto i = 0; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(InputSize())) {
       const auto& input = Input(i);
       res = res || input.numel() > 0;
     }
@@ -1208,7 +1207,7 @@
     auto size = input.numel();
     auto first = input_data[0];
 
-    for (int i = 1; i < size; i++) {
+    for (const auto i : c10::irange(1, size)) {
       CAFFE_ENFORCE(
           input_data[i] == first, "All elements of input must be same ");
     }
@@ -1255,7 +1254,7 @@
     size_t start = 0;
     size_t blockSize = ranges.size_from_dim(1);
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (size_t i = 0; i < batchSize; ++i) {
+    for (const auto i : c10::irange(batchSize)) {
       auto end = start + blockSize;
       outputLengthsPtr[i] = accumulate(rangesData, start, end);
       start = end;
@@ -1329,7 +1328,7 @@
 
     int64_t total_length = 0;
     // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-    for (size_t i = 0; i < indices.numel(); ++i) {
+    for (const auto i : c10::irange(indices.numel())) {
       auto idx = indices_data[i];
       CAFFE_ENFORCE_LT(idx, lengths.numel());
       total_length += lengths_data[idx];
@@ -1341,7 +1340,7 @@
     offsets_.clear();
     int64_t running_offset = 0;
     offsets_.reserve(lengths.numel());
-    for (size_t i = 0; i < lengths.numel(); ++i) {
+    for (const auto i : c10::irange(lengths.numel())) {
       offsets_.push_back(running_offset);
       running_offset += lengths_data[i];
     }
@@ -1355,7 +1354,7 @@
     auto block_bytesize = block_size * items.itemsize();
     auto out = static_cast<char*>(output->raw_mutable_data(items.dtype()));
 
-    for (size_t i = 0; i < indices.numel(); ++i) {
+    for (const auto i : c10::irange(indices.numel())) {
       auto idx = indices_data[i];
       auto length = lengths_data[idx];
       context_.CopyItemsSameDevice(
@@ -1406,7 +1405,7 @@
     math::Set<int64_t, Context>(
         num_output_buckets_, 0, cur_hist_data, &context_);
 
-    for (int i = 0; i < N; i++) {
+    for (const auto i : c10::irange(N)) {
       int bucket_index = -1;
       if (X_data[i] < lower_bound_) {
         bucket_index = 0;
@@ -1419,7 +1418,7 @@
       accumulate_hist_[bucket_index] += 1;
     }
 
-    for (int i = 0; i < num_output_buckets_; i++) {
+    for (const auto i : c10::irange(num_output_buckets_)) {
       acc_hist_data[i] = accumulate_hist_[i];
     }
 
@@ -1464,7 +1463,7 @@
     T start = 0;
     T step = 1;
 
-    for (int i = 0; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(InputSize())) {
       CAFFE_ENFORCE_EQ(
           Input(i).numel(), 1, "All inputs must be scalar/1D tensor.");
     }
diff --git a/caffe2/operators/variable_length_sequence_padding.h b/caffe2/operators/variable_length_sequence_padding.h
index f86964d..4a594bc 100644
--- a/caffe2/operators/variable_length_sequence_padding.h
+++ b/caffe2/operators/variable_length_sequence_padding.h
@@ -17,7 +17,7 @@
     const int32_t* seqLengths,
     const T padValue,
     Context* /*context*/) {
-  for (int j = 0; j < B; j++) {
+  for (const auto j : c10::irange(B)) {
     for (int i = seqLengths[j]; i < N; i++) {
       EigenVectorArrayMap<T>(X + B * M * i + M * j, M).setConstant(padValue);
     }
diff --git a/caffe2/opt/custom/cc_amrc.h b/caffe2/opt/custom/cc_amrc.h
index 806e2bb..d6b099d 100644
--- a/caffe2/opt/custom/cc_amrc.h
+++ b/caffe2/opt/custom/cc_amrc.h
@@ -54,7 +54,7 @@
     }
     int before = 1, after = 1;
     vector<int64_t> output_dims(concat_input_0.sizes().vec());
-    for (int i = 0; i < concat_input_0.dim(); ++i) {
+    for (const auto i : c10::irange(concat_input_0.dim())) {
       if (i == canonical_axis) {
         continue;
       }
@@ -65,7 +65,7 @@
         after *= dim;
       }
       // check the input dims are compatible.
-      for (int j = concat_input_start; j < InputSize(); ++j) {
+      for (const auto j : c10::irange(concat_input_start, InputSize())) {
         int dim_j = Input(j).dim32(i);
         CAFFE_ENFORCE(
             dim == dim_j,
@@ -93,7 +93,7 @@
         "Cannot handle fused concat with dim > 2, please update your fusion logic");
 
     int output_channels = 0;
-    for (int i = concat_input_start; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(concat_input_start, InputSize())) {
       axis_data[i - concat_input_start] = Input(i).dim32(canonical_axis);
       output_channels += Input(i).dim32(canonical_axis);
     }
@@ -101,7 +101,7 @@
     auto* output = Output(0, output_dims, at::dtype<float>());
 
     size_t output_offset = 0;
-    for (int i = concat_input_start; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(concat_input_start, InputSize())) {
       auto& input = Input(i);
       auto axis_dim = input.dim32(canonical_axis);
       math::CopyMatrix<Context>(
@@ -127,7 +127,7 @@
     const auto _zeros = _mm256_set1_ps(0.f);
 
     output_offset = 0;
-    for (auto outer = 0; outer < before; ++outer) {
+    for (const auto outer : c10::irange(before)) {
       auto axis_dim = output->dim32(canonical_axis);
       size_t inner_size = axis_dim * after;
       auto inner = 0;
@@ -148,7 +148,7 @@
         _mm256_storeu_ps(&output_data[output_offset + inner], out_val);
       }
 
-      for (auto inner_omp = inner; inner_omp < inner_size; ++inner_omp) {
+      for (const auto inner_omp : c10::irange(inner, inner_size)) {
         float elem = output_data[output_offset + inner_omp];
         float add_elem = add_input_data[inner_omp];
         float mul_elem = mul_input_data[inner_omp];
diff --git a/caffe2/opt/nql/ast.h b/caffe2/opt/nql/ast.h
index 2fee8fc..0d6aefc 100644
--- a/caffe2/opt/nql/ast.h
+++ b/caffe2/opt/nql/ast.h
@@ -1,4 +1,5 @@
 #pragma once
+#include "c10/util/irange.h"
 #include <iostream>
 #include <string>
 #include <vector>
@@ -20,8 +21,7 @@
     return starInputsFlag;
   }
   void dump(int level = 0) const {
-    for (int i = 0; i < level; i++)
-      std::cout << "  ";
+    for (const auto i : c10::irange(level)) { (void)i; std::cout << "  "; } // indent
     if (!isCall())
       std::cout << "Var: " << name << std::endl;
     else {
@@ -41,8 +41,7 @@
     delete rhs;
   }
   void dump(int level = 0) const {
-    for (int i = 0; i < level; i++)
-      std::cout << "  ";
+    for (const auto i : c10::irange(level)) { (void)i; std::cout << "  "; } // indent
     std::cout << "LHS:" << std::endl;
     for (auto s : lhs) {
       for (int i = 0; i < level + 1; i++)
diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h
index 9e3aa80..99f25a4 100644
--- a/caffe2/opt/onnxifi_op.h
+++ b/caffe2/opt/onnxifi_op.h
@@ -6,6 +6,7 @@
 
 #include <c10/util/Exception.h>
 #include <c10/util/SmallVector.h>
+#include <c10/util/irange.h>
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
@@ -138,7 +139,7 @@
 
     if (use_passed_output_shapes_) {
       // Populate output_shapes_per_bs_
-      for (int bs = 1; bs < max_batch_size_; ++bs) {
+      for (const auto bs : c10::irange(1, max_batch_size_)) {
         auto output_shapes_tp = helper.GetRepeatedArgument<TensorProto>("output_shapes_bs_" + caffe2::to_string(bs));
         auto output_qshapes_tp = helper.GetRepeatedArgument<TensorProto>("output_qshapes_bs_" + caffe2::to_string(bs));
         CAFFE_ENFORCE_EQ(output_names_.size(), output_shapes_tp.size() + output_qshapes_tp.size());
@@ -267,7 +268,7 @@
           ONNXIFI_STATUS_SUCCESS);
 
       // Release unused backend ids.
-      for (size_t i = 0; i < num_backends; ++i) {
+      for (const auto i : c10::irange(num_backends)) {
         if (i == static_cast<size_t>(backend_index)) {
           continue;
         }
@@ -287,7 +288,7 @@
 
       // Extra weight shapes
       std::unordered_map<std::string, ShapeInfo> weight_shape_info;
-      for (size_t i = 0; i < weight_names.size(); ++i) {
+      for (const auto i : c10::irange(weight_names.size())) {
         TensorShape shape;
         const auto& shape0 = weight_shapes[i];
         for (const auto d : shape0) {
diff --git a/caffe2/perfkernels/adagrad.h b/caffe2/perfkernels/adagrad.h
index 12cd410..f030e3e 100644
--- a/caffe2/perfkernels/adagrad.h
+++ b/caffe2/perfkernels/adagrad.h
@@ -6,6 +6,7 @@
 #include <immintrin.h>
 #endif
 #include <c10/util/Half.h>
+#include <c10/util/irange.h>
 
 namespace caffe2 {
 
@@ -26,7 +27,7 @@
     float epsilon,
     float lr,
     float weight_decay = 0.f) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = std::fma(weight_decay, w[i], g[i]);
     float hi = decay * h[i] + gi * gi;
     nh[i] = hi;
diff --git a/caffe2/perfkernels/lstm_unit_cpu-impl.h b/caffe2/perfkernels/lstm_unit_cpu-impl.h
index 4584ce0..5e76e1a 100644
--- a/caffe2/perfkernels/lstm_unit_cpu-impl.h
+++ b/caffe2/perfkernels/lstm_unit_cpu-impl.h
@@ -2,6 +2,7 @@
 #include <string.h>
 #include <cmath>
 #include <cstdint>
+#include "c10/util/irange.h"
 #include "caffe2/utils/conversions.h"
 
 #if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)
@@ -53,7 +54,7 @@
     T* H,
     const float forget_bias) {
   const T forgetBias = convert::To<float, T>(forget_bias);
-  for (int n = 0; n < N; ++n) {
+  for (const auto n : c10::irange(N)) {
     const bool valid = seqLengths == nullptr || t < seqLengths[n];
     if (!valid) {
       if (drop_states) {
@@ -67,7 +68,7 @@
       const T* X_D = &X[D];
       const T* X_2D = &X[2 * D];
       const T* X_3D = &X[3 * D];
-      VECTOR_LOOP for (int d = 0; d < D; ++d) {
+      VECTOR_LOOP for (const auto d : c10::irange(D)) {
         const T i = sigmoid(X[d]);
         const T f = sigmoid(X_D[d] + forgetBias);
         const T o = sigmoid(X_2D[d]);
@@ -105,7 +106,7 @@
     T* X_diff,
     const float forget_bias) {
   const T localForgetBias = convert::To<float, T>(forget_bias);
-  for (int n = 0; n < N; ++n) {
+  for (const auto n : c10::irange(N)) {
     const bool valid = seqLengths == nullptr || t < seqLengths[n];
 
     if (!valid) {
@@ -118,7 +119,7 @@
       }
       memset(X_diff, 0, 4 * sizeof(T) * D);
     } else {
-      VECTOR_LOOP for (int d = 0; d < D; ++d) {
+      VECTOR_LOOP for (const auto d : c10::irange(D)) {
         T* c_prev_diff = C_prev_diff + d;
         T* h_prev_diff = H_prev_diff + d;
         T* i_diff = X_diff + d;
diff --git a/caffe2/predictor/emulator/data_filler.h b/caffe2/predictor/emulator/data_filler.h
index e3021f6..55aed11 100644
--- a/caffe2/predictor/emulator/data_filler.h
+++ b/caffe2/predictor/emulator/data_filler.h
@@ -59,12 +59,12 @@
       : init_net_(init_net), data_net_(data_net) {
     // The output of the data_net_ will be served as the input
     int op_size = data_net_.op_size();
-    for (int i = 0; i < op_size; ++i) {
+    for (const auto i : c10::irange(op_size)) {
       OperatorDef op_def = data_net_.op(i);
       // We rely on Fill op to generate inputs
       CAFFE_ENFORCE(op_def.type().find("Fill") != std::string::npos);
       int output_size = op_def.output_size();
-      for (int j = 0; j < output_size; ++j) {
+      for (const auto j : c10::irange(output_size)) {
         input_names_.push_back(op_def.output(j));
       }
     }
@@ -105,7 +105,7 @@
       int input_index,
       const std::vector<std::vector<int64_t>>& input_dims) {
     Workspace ws;
-    for (int i = 0; i < op_def.input_size(); ++i) {
+    for (const auto i : c10::irange(op_def.input_size())) {
       // CreateOperator requires all input blobs present
       ws.CreateBlob(op_def.input(i));
     }
diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h
index aa67d32..6d89b55 100644
--- a/caffe2/python/pybind_state.h
+++ b/caffe2/python/pybind_state.h
@@ -153,12 +153,12 @@
     if (numpy_type == NPY_OBJECT) {
       PyObject** outObj = reinterpret_cast<PyObject**>(outPtr);
       auto* str = tensor.template data<std::string>();
-      for (int i = 0; i < tensor.numel(); ++i) {
+      for (const auto i : c10::irange(tensor.numel())) {
         outObj[i] = PyBytes_FromStringAndSize(str->data(), str->size());
         str++;
         // cleanup on failure
         if (outObj[i] == nullptr) {
-          for (int j = 0; j < i; ++j) {
+          for (const auto j : c10::irange(i)) {
             Py_DECREF(outObj[j]);
           }
           CAFFE_THROW("Failed to allocate string for ndarray of strings.");
@@ -212,7 +212,7 @@
     int ndim = PyArray_NDIM(array);
     npy_intp* npy_dims = PyArray_DIMS(array);
     std::vector<int64_t> dims;
-    for (int i = 0; i < ndim; ++i) {
+    for (const auto i : c10::irange(ndim)) {
       dims.push_back(npy_dims[i]);
     }
 
@@ -229,7 +229,7 @@
               dims, at::dtype<std::string>().device(Context::GetDeviceType()));
         }
         auto* outPtr = tensor.template mutable_data<std::string>();
-        for (int i = 0; i < tensor.numel(); ++i) {
+        for (const auto i : c10::irange(tensor.numel())) {
           char* str;
           Py_ssize_t strSize;
           if (PyBytes_Check(input[i])) {
@@ -375,7 +375,7 @@
 
       std::vector<py::object> inputs;
       inputs.reserve(InputSize());
-      for (auto i = 0; i < InputSize(); ++i) {
+      for (const auto i : c10::irange(InputSize())) {
         const auto* blob = &InputBlob(i);
         // Allow CPU tensors in addition to operator context's tensors
         py::object py_obj;
@@ -395,7 +395,7 @@
       }
       std::vector<py::object> outputs;
       outputs.reserve(OutputSize());
-      for (auto i = 0; i < OutputSize(); ++i) {
+      for (const auto i : c10::irange(OutputSize())) {
         auto* blob = OutputBlob(i);
 
         // Python op is always used with CPUContext only and treats inputs and
diff --git a/caffe2/quantization/server/elementwise_dnnlowp_op.h b/caffe2/quantization/server/elementwise_dnnlowp_op.h
index aac1020..c7d1d24 100644
--- a/caffe2/quantization/server/elementwise_dnnlowp_op.h
+++ b/caffe2/quantization/server/elementwise_dnnlowp_op.h
@@ -127,7 +127,7 @@
         size_t n,                                                            \
         size_t post,                                                         \
         CPUContext*) {                                                       \
-      for (int i = 0; i < pre; ++i) {                                        \
+      for (const auto i : c10::irange(pre)) {                                \
         EigenArrayMap<R>(out + i * n * post, post, n) = eigen_op(            \
             (ConstEigenArrayMap<T>(a + i * n * post, post, n).rowwise()),    \
             (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n)));   \
diff --git a/caffe2/quantization/server/im2col_dnnlowp.h b/caffe2/quantization/server/im2col_dnnlowp.h
index 92f7b27..dc34714 100644
--- a/caffe2/quantization/server/im2col_dnnlowp.h
+++ b/caffe2/quantization/server/im2col_dnnlowp.h
@@ -50,7 +50,7 @@
       auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
           kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
       const auto* src = data_im + nip * (height * width);
-      for (auto y = 0; y < output_h; y++) {
+      for (const auto y : c10::irange(output_h)) {
         const auto iy = y * stride_h + kh;
         const auto ix = kw;
         if (stride_w == 1) {
@@ -59,7 +59,7 @@
               src + (iy * width + ix),
               sizeof(T) * output_w);
         } else {
-          for (auto x = 0; x < output_w; x++) {
+          for (const auto x : c10::irange(output_w)) {
             memcpy(
                 dst + (y * output_w + x),
                 src + (iy * width + ix + x * stride_w),
@@ -78,8 +78,8 @@
     const int pad_w = pad_l;
     const int channel_size = height * width;
     for (int channel = channels; channel--; data_im += channel_size) {
-      for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
-        for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+      for (const auto kernel_row : c10::irange(kernel_h)) {
+        for (const auto kernel_col : c10::irange(kernel_w)) {
           int input_row = -pad_h + kernel_row * dilation_h;
           for (int output_rows = output_h; output_rows; output_rows--) {
             if (!utils::IsAGeZeroAndALtB(input_row, height)) {
@@ -113,12 +113,12 @@
   int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
 
   int channels_col = channels * kernel_h * kernel_w;
-  for (int c = 0; c < channels_col; ++c) {
+  for (const auto c : c10::irange(channels_col)) {
     int w_offset = c % kernel_w;
     int h_offset = (c / kernel_w) % kernel_h;
     int c_im = c / kernel_h / kernel_w;
-    for (int h = 0; h < height_col; ++h) {
-      for (int w = 0; w < width_col; ++w) {
+    for (const auto h : c10::irange(height_col)) {
+      for (const auto w : c10::irange(width_col)) {
         int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
         int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
         if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
@@ -152,20 +152,20 @@
       kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
   std::vector<int> d_offset(N, 0);
   std::vector<int> d_iter(N, 0);
-  for (int i = 0; i < outer_size; ++i) {
+  for (const auto i : c10::irange(outer_size)) {
     // Loop over spatial axes in reverse order to compute a per-axis offset.
     int offset = i;
     for (int d_i = N - 1; d_i >= 0; --d_i) {
       d_offset[d_i] = offset % kernel_shape[d_i];
       offset /= kernel_shape[d_i];
     }
-    for (int j = 0; j < inner_size; ++j) {
+    for (const auto j : c10::irange(inner_size)) {
       // Loop over spatial axes in forward order to compute the indices in the
       // image and column, and whether the index lies in the padding.
       const int col_index = i * inner_size + j;
       int img_index = i / kernel_size;
       bool is_padding = false;
-      for (int d_i = 0; d_i < N; ++d_i) {
+      for (const auto d_i : c10::irange(N)) {
         const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
             d_offset[d_i] * dilation[d_i];
         is_padding |= d_img < 0 || d_img >= img_shape[d_i + 1];
@@ -216,13 +216,13 @@
     T* data_col_temp =
         data_col + h * width_col * kernel_h * kernel_w * channels;
     int w_pad = -pad_l;
-    for (int w = 0; w < width_col; ++w) {
+    for (const auto w : c10::irange(width_col)) {
       int r = 0;
       for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
         int s = 0;
         for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
           if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
-            for (int g = 0; g < groups; ++g) {
+            for (const auto g : c10::irange(groups)) {
               memcpy(
                   data_col_temp +
                       ((g * kernel_h + r) * kernel_w + s) * (channels / groups),
@@ -232,7 +232,7 @@
             }
           } else {
             // This should be simply padded with zero.
-            for (int g = 0; g < groups; ++g) {
+            for (const auto g : c10::irange(groups)) {
               for (int i = 0; i < channels / groups; ++i) {
                 data_col_temp
                     [(((g * kernel_h + r) * kernel_w) + s) *
@@ -293,12 +293,12 @@
 #endif
   for (int t = 0; t < frame_col; ++t) {
     int t_pad = -pad_p + t * stride_t;
-    for (int h = 0; h < height_col; ++h) {
+    for (const auto h : c10::irange(height_col)) {
       int h_pad = -pad_t + h * stride_h;
       T* data_col_temp = data_col +
           (t * height_col + h) * width_col * kernel_t * kernel_h * kernel_w *
               channels;
-      for (int w = 0; w < width_col; ++w) {
+      for (const auto w : c10::irange(width_col)) {
         int w_pad = -pad_l + w * stride_w;
         int q = 0;
         for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
@@ -309,7 +309,7 @@
                  iw += dilation_w, ++s) {
               if (it >= 0 && it < num_frames && ih >= 0 && ih < height &&
                   iw >= 0 && iw < width) {
-                for (int g = 0; g < groups; ++g) {
+                for (const auto g : c10::irange(groups)) {
                   memcpy(
                       data_col_temp +
                           (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
@@ -320,7 +320,7 @@
                 }
               } else {
                 // This should be simply padded with zero.
-                for (int g = 0; g < groups; ++g) {
+                for (const auto g : c10::irange(groups)) {
                   for (int i = 0; i < channels / groups; ++i) {
                     data_col_temp
                         [((((g * kernel_t + q) * kernel_h + r) * kernel_w) +
diff --git a/caffe2/quantization/server/mmio.h b/caffe2/quantization/server/mmio.h
index 91564e5..b52c408 100644
--- a/caffe2/quantization/server/mmio.h
+++ b/caffe2/quantization/server/mmio.h
@@ -36,8 +36,8 @@
     }
     fprintf(fp, "%d %d\n", m, n);
     // matrix market array format uses column-major order
-    for (int j = 0; j < n; ++j) {
-      for (int i = 0; i < m; ++i) {
+    for (const auto j : c10::irange(n)) {
+      for (const auto i : c10::irange(m)) {
         if (is_integral<T>::value) {
           // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
           fprintf(fp, "%d\n", static_cast<int>(a[j * m + i]));
diff --git a/caffe2/quantization/server/utility_dnnlowp_ops.h b/caffe2/quantization/server/utility_dnnlowp_ops.h
index 1a0d830..9818e71 100644
--- a/caffe2/quantization/server/utility_dnnlowp_ops.h
+++ b/caffe2/quantization/server/utility_dnnlowp_ops.h
@@ -54,7 +54,7 @@
     const Index* idxs = indices.template data<Index>();
     auto out = static_cast<char*>(output->raw_mutable_data(data.dtype()));
 
-    for (int i = 0; i < N; ++i) {
+    for (const auto i : c10::irange(N)) {
       auto idx = idxs[i];
       CAFFE_ENFORCE(
           0 <= idx && idx < data.size(0),
diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h
index 1d56716..006fd3a 100644
--- a/caffe2/queue/queue_ops.h
+++ b/caffe2/queue/queue_ops.h
@@ -149,7 +149,7 @@
     }
 
     const int kTensorGrowthPct = 40;
-    for (int i = 0; i < numRecords_; ++i) {
+    for (const auto i : c10::irange(numRecords_)) {
       if (!queue->blockingRead(blobPtrs_)) {
         // if we read at least one record, status is still true
         return i > 0;
diff --git a/caffe2/queue/rebatching_queue_ops.h b/caffe2/queue/rebatching_queue_ops.h
index 999c2b5..2338b15 100644
--- a/caffe2/queue/rebatching_queue_ops.h
+++ b/caffe2/queue/rebatching_queue_ops.h
@@ -32,7 +32,7 @@
     CAFFE_ENFORCE_EQ(InputSize(), queue->numBlobs() + 1);
     std::vector<const Tensor*> inputTensors;
     inputTensors.reserve(InputSize() - 1);
-    for (int i = 1; i < InputSize(); ++i) {
+    for (const auto i : c10::irange(1, InputSize())) {
       inputTensors.push_back(&Input(i));
     }
 
@@ -56,7 +56,7 @@
 
     std::vector<Tensor*> outputTensors;
     outputTensors.reserve(OutputSize());
-    for (int i = 0; i < OutputSize(); ++i) {
+    for (const auto i : c10::irange(OutputSize())) {
       outputTensors.push_back(Output(i));
     }
 
diff --git a/caffe2/sgd/adadelta_op.h b/caffe2/sgd/adadelta_op.h
index 402edf7..d24ba2a 100644
--- a/caffe2/sgd/adadelta_op.h
+++ b/caffe2/sgd/adadelta_op.h
@@ -18,7 +18,7 @@
     float* nh,
     float* nd,
     Context* /*context*/) {
-  for (int i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float di = d[i];
     float hi = nh[i] = decay * h[i] + (1.0f - decay) * gi * gi;
@@ -120,7 +120,7 @@
     }
 
     auto block_size = Input(GRAD).numel() / n;
-    for (int i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       if (block_size == 1) {
         float gi = gradIn[i];
diff --git a/caffe2/sgd/adagrad_fused.h b/caffe2/sgd/adagrad_fused.h
index 29c506f..9f18433 100644
--- a/caffe2/sgd/adagrad_fused.h
+++ b/caffe2/sgd/adagrad_fused.h
@@ -82,8 +82,8 @@
     auto* grad_buffer_data =
         is_mean ? grad_buffer_.template mutable_data<T>() : NULL;
     if (is_mean) {
-      for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
-        for (auto tmpIndex = 0; tmpIndex < block_size; ++tmpIndex) {
+      for (const auto rangeIndex : c10::irange(numSegments)) {
+        for (const auto tmpIndex : c10::irange(block_size)) {
           auto offsetI = rangeIndex * block_size;
           grad_buffer_data[offsetI + tmpIndex] = lengths[rangeIndex] > 0
               ? gradIn[offsetI + tmpIndex] / lengths[rangeIndex]
@@ -92,7 +92,7 @@
       }
     }
 
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex];
            ++dataIndex) {
         std::size_t idx = indices[dataIndex];
@@ -243,7 +243,7 @@
     // ignores this dependency and fuses these two loops.
     std::vector<T> temp_grad(block_size);
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex];
            ++dataIndex) {
         std::size_t idx = indices[dataIndex];
@@ -277,7 +277,7 @@
     CAFFE_ENFORCE_EQ(dataIndex, n);
 
     dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex];
            ++dataIndex) {
         std::size_t idx = indices[dataIndex];
@@ -285,7 +285,7 @@
         auto offsetIdx = idx * block_size;
         auto localOffset = dataIndex - start;
 
-        for (int i = 0; i < block_size; ++i) {
+        for (const auto i : c10::irange(block_size)) {
           temp_grad[i] = auxParamIn[localOffset] * gradIn[offsetI + i];
         }
 
@@ -409,7 +409,7 @@
 
     std::vector<T> temp_grad(block_size);
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex];
            ++dataIndex) {
         std::size_t idx = indices[dataIndex];
@@ -440,7 +440,7 @@
             auxGrad + dataIndex,
             &context_);
 
-        for (int i = 0; i < block_size; ++i) {
+        for (const auto i : c10::irange(block_size)) {
           temp_grad[i] = auxParamIn[localOffset] * gradIn[offsetI + i];
         }
 
diff --git a/caffe2/sgd/adagrad_op.h b/caffe2/sgd/adagrad_op.h
index b683b7e..8646d01 100644
--- a/caffe2/sgd/adagrad_op.h
+++ b/caffe2/sgd/adagrad_op.h
@@ -39,7 +39,7 @@
     const float* lr,
     Context* /*context*/,
     float weight_decay = 0.f) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float grad = std::fma(weight_decay, paramIn[i], gradIn[i]);
     float moment = momentOut[i] = decay * momentIn[i] + grad * grad;
     float effective_lr = effectiveLROut[i] =
@@ -63,7 +63,7 @@
     const float* lr,
     Context* /*context*/,
     float weight_decay = 0.f) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float grad = std::fma(weight_decay, paramIn[i], gradIn[i]);
     float moment = momentOut[i] = decay * momentIn[i] + grad * grad;
     float effective_lr = effectiveLROut[i] =
@@ -300,7 +300,7 @@
     const auto* momentIn = Input(MOMENT_1).template data<float>();
 
     std::vector<float> grad(block_size);
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       auto offsetI = i * block_size;
       auto offsetIdx = idx * block_size;
@@ -504,7 +504,7 @@
 #else
     VLOG(1) << "using plain adagrad updates in RowWiseSparseAdagradOp";
 
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       float freq = (counter_halflife_ > 0 && count[idx] > 0)
           ? counter_halflife_ / count[idx]
@@ -542,13 +542,13 @@
         const float* g = gradIn + offsetI;
         float* h = moment + idx;
         float hs = 0.;
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           float gj = std::fma(weight_decay_ * freq, w[j], g[j]);
           hs += gj * gj;
         }
         float hi = h[0] = h[0] + hs / block_size;
         float step = lr[0] / (std::sqrt(hi) + epsilon_);
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           float gj = std::fma(weight_decay_ * freq, w[j], g[j]);
           w[j] = w[j] + gj * step;
         }
diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h
index ca1c5ae..d0aa6dd 100644
--- a/caffe2/sgd/adam_op.h
+++ b/caffe2/sgd/adam_op.h
@@ -21,7 +21,7 @@
     float correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -45,7 +45,7 @@
     float correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -74,7 +74,7 @@
     Context* /*context*/) {
   float k = (float)(t - lastSeenIn[0]);
   lastSeenOut[0] = t;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     // The number of steps since this param was last seen.
     // We don't need integer precision for k.  Float is fine and it's faster to convert here.
@@ -107,7 +107,7 @@
     float correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -135,7 +135,7 @@
     float r_correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -169,7 +169,7 @@
     float r_correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -204,7 +204,7 @@
     float r_correction,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1);
     float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2);
@@ -350,7 +350,7 @@
     auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data<T>();
 
     if (OutputSize() == 3) {
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];
 
         if (block_size == 1) {
@@ -444,7 +444,7 @@
     } else {
       Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
       auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data<T>();
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];
 
         if (block_size == 1) {
@@ -593,7 +593,7 @@
     auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data<T>();
     int64_t* lastSeenOut = Output(OUTPUT_LAST_SEEN)->template mutable_data<int64_t>();
 
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
         auto idx = indices[i];
         auto offsetI = i * block_size;
         auto offsetIdx = idx * block_size;
@@ -673,7 +673,7 @@
     auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data<T>();
 
     if (OutputSize() == 3) {
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];
 
         if (block_size == 1) {
@@ -719,13 +719,13 @@
           float* nm2 = moment2Out + idx;
 
           float m2_sum = 0.;
-          for (auto j = 0; j < block_size; ++j) {
+          for (const auto j : c10::irange(block_size)) {
             float gj = g[j];
             m2_sum += gj * gj;
           }
           float vi = nm2[0] =
               m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_);
-          for (auto j = 0; j < block_size; ++j) {
+          for (const auto j : c10::irange(block_size)) {
             float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_);
             nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_);
           }
@@ -734,7 +734,7 @@
     } else {
       Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD));
       auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data<T>();
-      for (auto i = 0; i < n; ++i) {
+      for (const auto i : c10::irange(n)) {
         auto idx = indices[i];
 
         if (block_size == 1) {
@@ -781,13 +781,13 @@
           float* ng = gradOut + offsetI;
 
           float m2_sum = 0.;
-          for (auto j = 0; j < block_size; ++j) {
+          for (const auto j : c10::irange(block_size)) {
             float gj = g[j];
             m2_sum += gj * gj;
           }
           float vi = nm2[0] =
               m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_);
-          for (auto j = 0; j < block_size; ++j) {
+          for (const auto j : c10::irange(block_size)) {
             float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_);
             float ngi = ng[j] = correction * mi / (std::sqrt(vi) + epsilon_);
             nw[j] = w[j] + lr[0] * ngi;
diff --git a/caffe2/sgd/learning_rate_adaption_op.h b/caffe2/sgd/learning_rate_adaption_op.h
index ff3e30f..10a4480 100644
--- a/caffe2/sgd/learning_rate_adaption_op.h
+++ b/caffe2/sgd/learning_rate_adaption_op.h
@@ -21,7 +21,7 @@
   float x = 0;
   float y = 0, z = 0;
   const float kEps = 1e-12f;
-  for (auto i = 0; i < n; i++) {
+  for (const auto i : c10::irange(n)) {
     x += grad[i] * effgrad[i];
     if (normalized_lr_adaption) {
       y += grad[i] * grad[i];
diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h
index fb0998a..74387f4 100644
--- a/caffe2/sgd/learning_rate_op.h
+++ b/caffe2/sgd/learning_rate_op.h
@@ -5,6 +5,7 @@
 #include <cmath>
 #include "caffe2/core/context.h"
 #include "caffe2/core/export_caffe2_op_to_c10.h"
+#include <c10/util/irange.h>
 #include "caffe2/core/operator.h"
 #include "caffe2/sgd/learning_rate_functors.h"
 
@@ -162,7 +163,7 @@
           sub_policy_num_iters.size(),
           0,
           "Must specify at least one sub learning rate policy.");
-      for (size_t i = 0; i < sub_policy_num_iters.size(); ++i) {
+      for (const auto i : c10::irange(sub_policy_num_iters.size())) {
         CAFFE_ENFORCE_GT(
             sub_policy_num_iters[i],
             0,
diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h
index 89b1352..5ef49ad 100644
--- a/caffe2/sgd/momentum_sgd_op.h
+++ b/caffe2/sgd/momentum_sgd_op.h
@@ -17,7 +17,7 @@
     float* param,
     Context* /*context*/) {
   const float LR = lr[0];
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     if (!nesterov) {
       const float adjusted_gradient = LR * g[i] + momentum * m[i];
       nm[i] = adjusted_gradient;
@@ -154,7 +154,7 @@
     auto* momentumOut = Output(OUTPUT_MOMENTUM)->template mutable_data<T>();
     auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<T>();
 
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       auto offsetI = i * block_size;
       auto offsetIdx = idx * block_size;
diff --git a/caffe2/sgd/rowwise_adagrad_fused.h b/caffe2/sgd/rowwise_adagrad_fused.h
index 953ccbe..1d1076a 100644
--- a/caffe2/sgd/rowwise_adagrad_fused.h
+++ b/caffe2/sgd/rowwise_adagrad_fused.h
@@ -217,8 +217,8 @@
     auto* grad_buffer_data =
         is_mean ? grad_buffer_.template mutable_data<T>() : NULL;
     if (is_mean) {
-      for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
-        for (auto tmpIndex = 0; tmpIndex < block_size; ++tmpIndex) {
+      for (const auto rangeIndex : c10::irange(numSegments)) {
+        for (const auto tmpIndex : c10::irange(block_size)) {
           auto offsetI = rangeIndex * block_size;
           grad_buffer_data[offsetI + tmpIndex] = lengths[rangeIndex] > 0
               ? gradIn[offsetI + tmpIndex] / lengths[rangeIndex]
@@ -269,7 +269,7 @@
       T counter_halflife,
       rowWiseAdagradT& kernel) {
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       auto offsetI = rangeIndex * block_size;
       const float* g = gradIn + offsetI;
 
@@ -557,7 +557,7 @@
     // ignores this dependency and fuses these two loops.
     std::vector<T> temp_grad(block_size);
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex];
            ++dataIndex) {
         std::size_t idx = indices[dataIndex];
@@ -591,7 +591,7 @@
     CAFFE_ENFORCE_EQ(dataIndex, n);
 
     dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       auto offsetI = rangeIndex * block_size;
       const float* g = gradIn + offsetI;
 
@@ -606,7 +606,7 @@
         auto offsetIdx = idx * block_size;
         auto localOffset = dataIndex - start;
 
-        for (int i = 0; i < block_size; ++i) {
+        for (const auto i : c10::irange(block_size)) {
           temp_grad[i] = auxParamIn[localOffset] * g[i];
         }
 
@@ -839,7 +839,7 @@
 
     std::vector<T> temp_grad(block_size);
     int dataIndex = 0;
-    for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
+    for (const auto rangeIndex : c10::irange(numSegments)) {
       auto offsetI = rangeIndex * block_size;
       const float* g = gradIn + offsetI;
 
@@ -902,7 +902,7 @@
 
         alignas(64) float temp[VLEN];
         _mm256_store_ps(temp, acc_v);
-        for (int j = 0; j < VLEN; ++j) {
+        for (const auto j : c10::irange(VLEN)) {
           acc += temp[j];
         }
 #endif
diff --git a/caffe2/sgd/rowwise_counter.h b/caffe2/sgd/rowwise_counter.h
index fb0647d..db8fa19 100644
--- a/caffe2/sgd/rowwise_counter.h
+++ b/caffe2/sgd/rowwise_counter.h
@@ -40,7 +40,7 @@
       return true;
     }
 
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       const std::size_t idx = indices[i];
       CAFFE_ENFORCE_GE(
           Input(COUNTER).numel(),
diff --git a/caffe2/sgd/storm_op.h b/caffe2/sgd/storm_op.h
index 0ecb0fa..5abf0c8 100644
--- a/caffe2/sgd/storm_op.h
+++ b/caffe2/sgd/storm_op.h
@@ -19,7 +19,7 @@
     const float beta,
     Context* /*context*/) {
   float gradSqSumTmp = 0.0;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     const float gi = gradIn[i];
     gradSqSumTmp += gi * gi;
   }
@@ -27,7 +27,7 @@
 
   const float nlr = lr[0] * std::pow(beta + gradSqSumOut[0], -1.0 / 3.0);
   const float alpha = momentum * nlr * nlr;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     const float gi = gradIn[i];
     const float mi = momentIn[i];
     float new_mi = momentOut[i] = gi + (1.0 - alpha) * (mi - gi);
@@ -120,7 +120,7 @@
     }
 
     float gradSqSumTmp = 0.0;
-    for (auto i = 0; i < Input(GRAD).numel(); ++i) {
+    for (const auto i : c10::irange(Input(GRAD).numel())) {
       const float gi = gradIn[i];
       gradSqSumTmp += gi * gi;
     }
@@ -130,7 +130,7 @@
     const float alpha = momentum_ * nlr * nlr;
     const auto block_size = Input(GRAD).numel() / n;
 
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       if (block_size == 1) {
         const float gi = gradIn[i];
@@ -162,7 +162,7 @@
             i);
 #endif
 
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           const float gi = gradIn[offsetI + j];
           const float mi = momentIn[offsetIdx + j];
           float new_mi = momentOut[offsetIdx + j] =
diff --git a/caffe2/sgd/wngrad_op.h b/caffe2/sgd/wngrad_op.h
index e9e1fd1..862efa9 100644
--- a/caffe2/sgd/wngrad_op.h
+++ b/caffe2/sgd/wngrad_op.h
@@ -15,12 +15,12 @@
     float epsilon,
     const float* lr,
     Context* /*context*/) {
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     nw[i] = w[i] + lr[0] * gi / (h[0] + epsilon);
   }
   float nhTmp = 0.0;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = g[i];
     nhTmp += gi * gi;
   }
@@ -42,13 +42,13 @@
     Context* /*context*/) {
   effectiveLROut[0] = lr[0] / (seqBIn[0] + epsilon);
   float seqBTmp = 0.0;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = gradIn[i];
     seqBTmp += gi * gi;
   }
   seqBTmp /= (seqBIn[0] + epsilon);
   seqBOut[0] = seqBIn[0] + seqBTmp;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float grad = gradIn[i];
     paramOut[i] = paramIn[i] + effectiveLROut[0] * grad;
   }
@@ -69,14 +69,14 @@
     Context* /*context*/) {
   effectiveLROut[0] = lr[0] / (seqBIn[0] + epsilon);
   float seqBTmp = 0.0;
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float gi = gradIn[i];
     seqBTmp += gi * gi;
   }
   seqBTmp /= (seqBIn[0] + epsilon);
   seqBOut[0] = seqBIn[0] + seqBTmp;
 
-  for (auto i = 0; i < N; ++i) {
+  for (const auto i : c10::irange(N)) {
     float grad = gradIn[i];
     float update = updateOut[i] = effectiveLROut[0] * grad;
     paramOut[i] = paramIn[i] + update;
@@ -193,7 +193,7 @@
 
     auto block_size = Input(GRAD).numel() / n;
 
-    for (auto i = 0; i < n; ++i) {
+    for (const auto i : c10::irange(n)) {
       auto idx = indices[i];
       if (block_size == 1) {
         float gi = gradIn[i];
@@ -222,7 +222,7 @@
             " for input i:",
             i);
 #endif
-        for (auto j = 0; j < block_size; ++j) {
+        for (const auto j : c10::irange(block_size)) {
           float gi = gradIn[offsetI + j];
           paramOut[offsetIdx + j] =
               paramIn[offsetIdx + j] + lr[0] * gi / (seqBIn[0] + epsilon_);
@@ -230,7 +230,7 @@
       }
     }
     float seqBTmp = 0.0;
-    for (auto i = 0; i < Input(GRAD).numel(); ++i) {
+    for (const auto i : c10::irange(Input(GRAD).numel())) {
       float gi = gradIn[i];
       seqBTmp += gi * gi;
     }
diff --git a/caffe2/sgd/yellowfin_op.h b/caffe2/sgd/yellowfin_op.h
index a3608aa..3ae7dd1 100644
--- a/caffe2/sgd/yellowfin_op.h
+++ b/caffe2/sgd/yellowfin_op.h
@@ -133,7 +133,7 @@
 CAFFE_ENFORCE_EQ(param_tensor.dim(), g_avg_tensor.dim());
 CAFFE_ENFORCE_EQ(param_tensor.dim(), g2_avg_tensor.dim());
 CAFFE_ENFORCE_EQ(param_tensor.dim(), grad_tensor.dim());
-for (int i = 0; i < param_tensor.dim(); ++i) {
+for (const auto i : c10::irange(param_tensor.dim())) {
   CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
   CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
   CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
diff --git a/caffe2/transforms/pattern_net_transform.h b/caffe2/transforms/pattern_net_transform.h
index 95638f4..bb18329 100644
--- a/caffe2/transforms/pattern_net_transform.h
+++ b/caffe2/transforms/pattern_net_transform.h
@@ -28,7 +28,7 @@
         "External outputs do not match!");
     ordered_ops_ = GetPatternTraversalOrder(p_);
     inverse_ops_.resize(ordered_ops_.size());
-    for (size_t i = 0; i < ordered_ops_.size(); i++) {
+    for (const auto i : c10::irange(ordered_ops_.size())) {
       inverse_ops_[ordered_ops_[i]] = i;
     }
   }
diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h
index b5c6b31..a690342 100644
--- a/caffe2/utils/proto_utils.h
+++ b/caffe2/utils/proto_utils.h
@@ -9,6 +9,7 @@
 
 #include <c10/util/Logging.h>
 #include <c10/util/string_view.h>
+#include <c10/util/irange.h>
 
 #include "caffe2/utils/proto_wrap.h"
 #include "caffe2/proto/caffe2_pb.h"
diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h
index 145dbc1..8126b82 100644
--- a/caffe2/utils/threadpool/WorkersPool.h
+++ b/caffe2/utils/threadpool/WorkersPool.h
@@ -4,6 +4,7 @@
 #include <condition_variable>
 #include <thread>
 #include "c10/util/thread_name.h"
+#include <c10/util/irange.h>
 #include "caffe2/core/common.h"
 #include "caffe2/core/logging.h"
 
@@ -339,7 +340,7 @@
     CreateWorkers(workers_count);
     DCHECK_LE(workers_count, (int)workers_.size());
     counter_to_decrement_when_ready_.Reset(workers_count);
-    for (size_t task = 1; task < tasks.size(); ++task) {
+    for (const auto task : c10::irange(1, tasks.size())) {
       workers_[task - 1]->StartWork(tasks[task].get());
     }
     // Execute the remaining workload immediately on the current thread.
diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h
index 9bab5f2..27f7e22 100644
--- a/caffe2/video/video_input_op.h
+++ b/caffe2/video/video_input_op.h
@@ -8,6 +8,7 @@
 #include <string>
 
 #include <c10/core/thread_pool.h>
+#include <c10/util/irange.h>
 #include <caffe2/core/db.h>
 #include <caffe2/core/logging.h>
 #include <caffe2/operators/prefetch_op.h>
@@ -225,7 +226,7 @@
     if (random_sampling_rate_) {
       LOG(INFO) << "random sampling with max:" << random_sampling_rate_;
     }
-    for (int i = 0; i < channels_rgb_; i++) {
+    for (const auto i : c10::irange(channels_rgb_)) {
       LOG(INFO) << "    RGB " << i << "-th channel mean: " << mean_rgb_[i]
                 << " std: " << 1.f / inv_std_rgb_[i];
     }
@@ -237,7 +238,7 @@
               << "and a sampling rate of 1:" << sampling_rate_of_
               << " flow_data_type_: " << flow_data_type_
               << " flow_alg_type_: " << flow_alg_type_;
-    for (int i = 0; i < channels_of_; i++) {
+    for (const auto i : c10::irange(channels_of_)) {
       LOG(INFO) << "    Optical flow" << i
                 << "-th channel mean: " << mean_of_[i]
                 << " std: " << 1.f / inv_std_of_[i];
@@ -257,7 +258,7 @@
   if (video_res_type_ == VideoResType::USE_SHORT_EDGE) {
     if (jitter_scales_.size() > 0) {
       LOG(INFO) << "Using scale jittering:";
-      for (int idx = 0; idx < jitter_scales_.size(); idx++) {
+      for (const auto idx : c10::irange(jitter_scales_.size())) {
         LOG(INFO) << "scale " << idx << ": " << jitter_scales_[idx];
       }
     } else {
@@ -390,7 +391,7 @@
       }
 
       channels_rgb_ = 3;
-      for (int i = 4; i < 7; i++) {
+      for (const auto i : c10::irange(4, 7)) {
         mean_rgb_.push_back(InputDataMean[i]);
         inv_std_rgb_.push_back(1.f / InputDataStd[i]);
       }
@@ -403,7 +404,7 @@
       get_optical_flow_ = false;
       get_rgb_ = true;
       sampling_rate_rgb_ = 1;
-      for (int i = 4; i < 7; i++) {
+      for (const auto i : c10::irange(4, 7)) {
         mean_rgb_.push_back(InputDataMean[i]);
         inv_std_rgb_.push_back(1.f / InputDataStd[i]);
       }
@@ -420,7 +421,7 @@
       switch (flow_data_type_) {
         case FlowDataType::Flow2C:
           channels_of_ = 2;
-          for (int i = 0; i < channels_of_; i++) {
+          for (const auto i : c10::irange(channels_of_)) {
             mean_of_.push_back(InputDataMean[i]);
             inv_std_of_.push_back(1.f / InputDataStd[i]);
           }
@@ -428,7 +429,7 @@
 
         case FlowDataType::Flow3C:
           channels_of_ = 3;
-          for (int i = 0; i < channels_of_; i++) {
+          for (const auto i : c10::irange(channels_of_)) {
             mean_of_.push_back(InputDataMean[i]);
             inv_std_of_.push_back(1.f / InputDataStd[i]);
           }
@@ -437,7 +438,7 @@
         // early fusion with gray
         case FlowDataType::FlowWithGray:
           channels_of_ = 3;
-          for (int i = 0; i < 2; i++) {
+          for (const auto i : c10::irange(2)) {
             mean_of_.push_back(InputDataMean[i]);
             inv_std_of_.push_back(1.f / InputDataStd[i]);
           }
@@ -448,11 +449,11 @@
         // early fusion with RGB
         case FlowDataType::FlowWithRGB:
           channels_of_ = 5;
-          for (int i = 0; i < 2; i++) {
+          for (const auto i : c10::irange(2)) {
             mean_of_.push_back(InputDataMean[i]);
             inv_std_of_.push_back(1.f / InputDataStd[i]);
           }
-          for (int i = 4; i < 7; i++) {
+          for (const auto i : c10::irange(4, 7)) {
             mean_of_.push_back(InputDataMean[i]);
             inv_std_of_.push_back(1.f / InputDataStd[i]);
           }
@@ -527,15 +528,15 @@
     int* label_data) {
   int num_clips = clip_per_video_ * crop_per_clip_;
   if (!do_multi_label_) {
-    for (int i = 0; i < num_clips; i++) {
+    for (const auto i : c10::irange(num_clips)) {
       label_data[i] = label_proto.int32_data(0);
     }
   } else {
     // For multiple label case, output label is a binary vector
     // where presented concepts are marked 1
     memset(label_data, 0, sizeof(int) * num_of_class_ * num_clips);
-    for (int i = 0; i < num_clips; i++) {
-      for (int j = 0; j < label_proto.int32_data_size(); j++) {
+    for (const auto i : c10::irange(num_clips)) {
+      for (const auto j : c10::irange(label_proto.int32_data_size())) {
         CAFFE_ENFORCE_LT(
             label_proto.int32_data(j),
             num_of_class_,
@@ -659,7 +660,7 @@
     const TensorProto& start_frm_proto = protos.protos(curr_proto_idx++);
     start_frm = start_frm_proto.int32_data(0);
     if (get_start_frame_) {
-      for (int i = 0; i < num_clips; i++) {
+      for (const auto i : c10::irange(num_clips)) {
         start_frame_data[i] = start_frm;
       }
     }
@@ -669,7 +670,7 @@
     CAFFE_ENFORCE_GE(
         protos.protos_size(), curr_proto_idx + 1, "Video Id not provided");
     const TensorProto& video_id_proto = protos.protos(curr_proto_idx);
-    for (int i = 0; i < num_clips; i++) {
+    for (const auto i : c10::irange(num_clips)) {
       video_id_data[i] = video_id_proto.int64_data(0);
     }
   }
@@ -774,7 +775,7 @@
     int clip_offset_of = channels_of_ * length_of_ * crop_size_ * crop_size_;
     for (int i = 0; i < std::min(clip_per_video_, int(buffer_rgb.size()));
          i++) {
-      for (int j = 0; j < crop_per_clip_; j++) {
+      for (const auto j : c10::irange(crop_per_clip_)) {
         // get the rectangle for cropping
         int h_off = 0;
         int w_off = 0;
@@ -857,7 +858,7 @@
       }
     }
     if (buffer_rgb.size() > 0) {
-      for (int i = 0; i < buffer_rgb.size(); i++) {
+      for (const auto i : c10::irange(buffer_rgb.size())) {
         unsigned char* buff = buffer_rgb[i];
         delete[] buff;
       }
@@ -886,12 +887,12 @@
     // Prefetching handled with a thread pool of "decode_threads" threads.
     std::mt19937 meta_randgen(time(nullptr));
     std::vector<std::mt19937> randgen_per_thread;
-    for (int i = 0; i < num_decode_threads_; ++i) {
+    for (const auto i : c10::irange(num_decode_threads_)) {
       randgen_per_thread.emplace_back(meta_randgen());
     }
 
     std::bernoulli_distribution mirror_this_clip(0.5);
-    for (int item_id = 0; item_id < batch_size_; ++item_id) {
+    for (const auto item_id : c10::irange(batch_size_)) {
       std::mt19937* randgen =
           &randgen_per_thread[item_id % num_decode_threads_];
 
diff --git a/test/cpp/api/dataloader.cpp b/test/cpp/api/dataloader.cpp
index b49330c..c0622ba 100644
--- a/test/cpp/api/dataloader.cpp
+++ b/test/cpp/api/dataloader.cpp
@@ -5,6 +5,7 @@
 #include <test/cpp/api/support.h>
 
 #include <c10/util/ArrayRef.h>
+#include <c10/util/irange.h>
 #include <c10/util/tempfile.h>
 
 #include <algorithm>
@@ -173,7 +174,7 @@
   for (auto& batch : *data_loader) {
     ASSERT_LT(batch_index, 3);
     ASSERT_EQ(batch.size(), kBatchSize);
-    for (size_t j = 0; j < kBatchSize; ++j) {
+    for (const auto j : c10::irange(kBatchSize)) {
       ASSERT_EQ(batch.at(j), 1 + (batch_index * kBatchSize) + j);
     }
     batch_index += 1;
@@ -837,7 +838,7 @@
 
   size_t i = 0;
   for (auto batch : *data_loader) {
-    for (int j = 0; j < kBatchSize; ++j) {
+    for (const auto j : c10::irange(kBatchSize)) {
       ASSERT_EQ(batch.at(j), 10 + j);
     }
     i += 1;
@@ -857,7 +858,7 @@
   ASSERT_EQ(res.size(), sample_count);
 
   std::sort(res.begin(), res.end());
-  for (size_t i = 0; i < res.size(); ++i) {
+  for (const auto i : c10::irange(res.size())) {
     ASSERT_EQ(res[i], i);
   }
 }
@@ -872,14 +873,14 @@
                            size_t batch_size) {
     std::vector<std::unique_ptr<samplers::DistributedRandomSampler>> samplers;
 
-    for (size_t i = 0; i < num_replicas; ++i) {
+    for (const auto i : c10::irange(num_replicas)) {
       samplers.emplace_back(
           torch::make_unique<samplers::DistributedRandomSampler>(
               sample_count, num_replicas, i, allow_duplicates));
     }
 
     std::vector<size_t> res;
-    for (size_t i = 0; i < num_replicas; ++i) {
+    for (const auto i : c10::irange(num_replicas)) {
       (*samplers[i]).reset();
       torch::optional<std::vector<size_t>> idx;
       while ((idx = (*samplers[i]).next(batch_size)).has_value()) {
@@ -953,7 +954,7 @@
   ASSERT_EQ(res.size(), sample_count);
 
   std::sort(res.begin(), res.end());
-  for (size_t i = 0; i < res.size(); ++i) {
+  for (const auto i : c10::irange(res.size())) {
     ASSERT_EQ(res[i], i);
   }
 }
@@ -969,14 +970,14 @@
     std::vector<std::unique_ptr<samplers::DistributedSequentialSampler>>
         samplers;
 
-    for (size_t i = 0; i < num_replicas; ++i) {
+    for (const auto i : c10::irange(num_replicas)) {
       samplers.emplace_back(
           torch::make_unique<samplers::DistributedSequentialSampler>(
               sample_count, num_replicas, i, allow_duplicates));
     }
 
     std::vector<size_t> res;
-    for (size_t i = 0; i < num_replicas; ++i) {
+    for (const auto i : c10::irange(num_replicas)) {
       (*samplers[i]).reset();
       torch::optional<std::vector<size_t>> idx;
       while ((idx = (*samplers[i]).next(batch_size)).has_value()) {
@@ -1490,7 +1491,7 @@
 
   auto data_loader = torch::data::make_data_loader(D{});
 
-  for (size_t i = 0; i < 10; ++i) {
+  for (const auto i : c10::irange(10)) {
     const auto number_of_iterations =
         std::distance(data_loader->begin(), data_loader->end());
     ASSERT_EQ(
@@ -1531,7 +1532,7 @@
       torch::data::datasets::make_shared_dataset<D>(),
       DataLoaderOptions().workers(kNumberOfWorkers));
 
-  for (size_t i = 0; i < 10; ++i) {
+  for (const auto i : c10::irange(10)) {
     const auto number_of_iterations =
         std::distance(data_loader->begin(), data_loader->end());
     ASSERT_EQ(
@@ -1574,7 +1575,7 @@
               })),
       DataLoaderOptions{});
 
-  for (size_t i = 0; i < 10; ++i) {
+  for (const auto i : c10::irange(10)) {
     const auto number_of_iterations =
         std::distance(data_loader->begin(), data_loader->end());
     ASSERT_EQ(
@@ -1675,7 +1676,8 @@
             dataset,
             DataLoaderOptions(batch_size).workers(dataloader_worker_count));
 
-        for (int epoch_index = 0; epoch_index < epoch_count; ++epoch_index) {
+        for (const auto epoch_index : c10::irange(epoch_count)) {
+          (void)epoch_index; // Suppress unused variable warning
           std::vector<bool> result(total_example_count, false);
           int iteration_count = 0;
           for (auto iterator = data_loader->begin();
@@ -1687,11 +1689,11 @@
             // When prefetch_count is equal to 1 and no worker thread, the batch
             // order is deterministic. So we can verify elements in each batch.
             if (prefetch_count == 1 && dataloader_worker_count == 0) {
-              for (size_t j = 0; j < batch_size; ++j) {
+              for (const auto j : c10::irange(batch_size)) {
                 ASSERT_EQ(batch[j], iteration_count * batch_size + j);
               }
             }
-            for (size_t j = 0; j < batch_size; ++j) {
+            for (const auto j : c10::irange(batch_size)) {
               result[batch[j]] = true;
             }
           }
@@ -1978,7 +1980,8 @@
         dataset,
         DataLoaderOptions(batch_size).workers(dataloader_worker_count));
 
-    for (int epoch_index = 0; epoch_index < epoch_count; ++epoch_index) {
+    for (const auto epoch_index : c10::irange(epoch_count)) {
+      (void)epoch_index; // Suppress unused variable warning
       int iteration_count = 0;
       for (auto iterator = data_loader->begin(); iterator != data_loader->end();
            ++iterator, ++iteration_count) {
@@ -2079,7 +2082,7 @@
   auto data_loader = torch::data::make_data_loader(
       dataset, DataLoaderOptions(batch_size).workers(dataloader_worker_count));
 
-  for (int epoch_index = 0; epoch_index < epoch_count; ++epoch_index) {
+  for (const auto epoch_index : c10::irange(epoch_count)) {
     int iteration_count = 0;
 
     // For the first epoch, the returned batch should be returned from the
@@ -2128,7 +2131,7 @@
       size_t index = 0;
 
       // Repeatly sample every 5 indices.
-      for (size_t i = 0; i < batch_size; ++i) {
+      for (const auto i : c10::irange(batch_size)) {
         for (size_t j = 0; j < size_ / batch_size; ++j) {
           indices_[index++] = i + batch_size * j;
         }
@@ -2222,11 +2225,11 @@
         // construct expected result
         int offset = 0;
 
-        for (int i = 0; i < (chunk_count + cross_chunk_shuffle_count - 1) /
-                 cross_chunk_shuffle_count;
-             i++) {
-          for (int j = 0; j < chunk_size; ++j) {
-            for (int k = 0; k < cross_chunk_shuffle_count; ++k) {
+        for (const auto i : c10::irange((chunk_count + cross_chunk_shuffle_count - 1) /
+                 cross_chunk_shuffle_count)) {
+          for (const auto j : c10::irange(chunk_size)) {
+            (void)j; // Suppress unused variable warning
+            for (const auto k : c10::irange(cross_chunk_shuffle_count)) {
               if (i * cross_chunk_shuffle_count + k < chunk_count) {
                 expected_result.push_back(i * cross_chunk_shuffle_count + k);
               }
diff --git a/test/cpp/api/dispatch.cpp b/test/cpp/api/dispatch.cpp
index 6416fe3..ba53006 100644
--- a/test/cpp/api/dispatch.cpp
+++ b/test/cpp/api/dispatch.cpp
@@ -2,6 +2,7 @@
 
 #include <torch/torch.h>
 #include <ATen/native/Pow.h>
+#include <c10/util/irange.h>
 #include <torch/types.h>
 #include <torch/utils.h>
 #include <test/cpp/api/support.h>
@@ -24,7 +25,7 @@
   setenv("ATEN_CPU_CAPABILITY", "avx2", 1);
 #endif
   const auto actual_pow_avx2 = vals_tensor.pow(pows_tensor);
-  for (int i = 0; i < 4; i++) {
+  for (const auto i : c10::irange(4)) {
     ASSERT_EQ(result[i], actual_pow_avx2[i].item<int>());
   }
 }
@@ -40,7 +41,7 @@
   setenv("ATEN_CPU_CAPABILITY", "avx512", 1);
 #endif
   const auto actual_pow_avx512 = vals_tensor.pow(pows_tensor);
-  for (int i = 0; i < 4; i++) {
+  for (const auto i : c10::irange(4)) {
     ASSERT_EQ(result[i], actual_pow_avx512[i].item<int>());
   }
 }
@@ -56,7 +57,7 @@
   setenv("ATEN_CPU_CAPABILITY", "default", 1);
 #endif
   const auto actual_pow_default = vals_tensor.pow(pows_tensor);
-  for (int i = 0; i < 4; i++) {
+  for (const auto i : c10::irange(4)) {
     ASSERT_EQ(result[i], actual_pow_default[i].item<int>());
   }
 }
diff --git a/test/cpp/api/expanding-array.cpp b/test/cpp/api/expanding-array.cpp
index 0ad6dd6..b3e7257 100644
--- a/test/cpp/api/expanding-array.cpp
+++ b/test/cpp/api/expanding-array.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <test/cpp/api/support.h>
@@ -13,7 +14,7 @@
 TEST_F(ExpandingArrayTest, CanConstructFromInitializerList) {
   torch::ExpandingArray<5> e({1, 2, 3, 4, 5});
   ASSERT_EQ(e.size(), 5);
-  for (size_t i = 0; i < e.size(); ++i) {
+  for (const auto i : c10::irange(e.size())) {
     ASSERT_EQ((*e)[i], i + 1);
   }
 }
@@ -21,7 +22,7 @@
 TEST_F(ExpandingArrayTest, CanConstructFromVector) {
   torch::ExpandingArray<5> e(std::vector<int64_t>{1, 2, 3, 4, 5});
   ASSERT_EQ(e.size(), 5);
-  for (size_t i = 0; i < e.size(); ++i) {
+  for (const auto i : c10::irange(e.size())) {
     ASSERT_EQ((*e)[i], i + 1);
   }
 }
@@ -29,7 +30,7 @@
 TEST_F(ExpandingArrayTest, CanConstructFromArray) {
   torch::ExpandingArray<5> e(std::array<int64_t, 5>({1, 2, 3, 4, 5}));
   ASSERT_EQ(e.size(), 5);
-  for (size_t i = 0; i < e.size(); ++i) {
+  for (const auto i : c10::irange(e.size())) {
     ASSERT_EQ((*e)[i], i + 1);
   }
 }
@@ -37,7 +38,7 @@
 TEST_F(ExpandingArrayTest, CanConstructFromSingleValue) {
   torch::ExpandingArray<5> e(5);
   ASSERT_EQ(e.size(), 5);
-  for (size_t i = 0; i < e.size(); ++i) {
+  for (const auto i : c10::irange(e.size())) {
     ASSERT_EQ((*e)[i], 5);
   }
 }
diff --git a/test/cpp/api/fft.cpp b/test/cpp/api/fft.cpp
index 5648a3d..5b6452d 100644
--- a/test/cpp/api/fft.cpp
+++ b/test/cpp/api/fft.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 #include <test/cpp/api/support.h>
 
@@ -14,15 +15,15 @@
   // Roots of unity, exp(-2*pi*j*n/N) for n in [0, N), reversed for inverse transform
   std::vector<c10::complex<double>> roots(len);
   const auto angle_base = (forward ? -2.0 : 2.0) * M_PI / len;
-  for (int64_t i = 0; i < len; ++i) {
+  for (const auto i : c10::irange(len)) {
     auto angle = i * angle_base;
     roots[i] = c10::complex<double>(std::cos(angle), std::sin(angle));
   }
 
   const auto in = x.data_ptr<c10::complex<double>>();
   const auto out = out_tensor.data_ptr<c10::complex<double>>();
-  for (int64_t i = 0; i < len; ++i) {
-    for (int64_t j = 0; j < len; ++j) {
+  for (const auto i : c10::irange(len)) {
+    for (const auto j : c10::irange(len)) {
       out[i] += roots[(j * i) % len] * in[j];
     }
   }
diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp
index 8b7889f..efcb1e8 100644
--- a/test/cpp/api/functional.cpp
+++ b/test/cpp/api/functional.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <test/cpp/api/support.h>
@@ -1127,7 +1128,7 @@
   int dims[] = {1, -1};
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers)
   int expected[] = {5*3, 5*4};
-  for(auto i=0; i<2; i++) {
+  for (const auto i : c10::irange(2)) {
     auto logits = torch::randn({5, 4, 3});
     int expected_count = expected[i];
     auto y_draw = F::gumbel_softmax(logits, F::GumbelSoftmaxFuncOptions().hard(true).dim(dims[i]));
@@ -1149,7 +1150,8 @@
 
     auto counts = torch::zeros_like(logits);
     torch::Tensor y_draw;
-    for (auto i=0; i<num_draws; i++) {
+    for (const auto i : c10::irange(num_draws)) {
+        (void)i; // Suppress unused variable warning
         y_draw = F::gumbel_softmax(logits, F::GumbelSoftmaxFuncOptions().hard(true));
         counts += y_draw;
     }
@@ -1175,7 +1177,7 @@
   auto output = F::softmax(input, /*dim=*/1);
   auto sum = torch::sum(torch::exp(input), 1);
 
-  for (int i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     auto expected = torch::exp(input[i]) / sum[i];
     ASSERT_TRUE(torch::allclose(output[i], expected));
   }
@@ -1187,7 +1189,7 @@
   auto output = F::softmin(input, /*dim=*/1);
   auto sum = torch::sum(torch::exp(-input), 1);
 
-  for (int i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     auto expected = torch::exp(-input[i]) / sum[i];
     ASSERT_TRUE(torch::allclose(output[i], expected));
   }
@@ -1199,7 +1201,7 @@
   auto output = F::log_softmax(input, /*dim=*/1);
   auto sum = torch::sum(torch::exp(input), 1);
 
-  for (int i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     auto expected = torch::log(torch::exp(input[i]) / sum[i]);
     ASSERT_TRUE(torch::allclose(output[i], expected));
   }
diff --git a/test/cpp/api/init.cpp b/test/cpp/api/init.cpp
index 71f6767..9e2ed42 100644
--- a/test/cpp/api/init.cpp
+++ b/test/cpp/api/init.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <test/cpp/api/init_baseline.h>
@@ -14,7 +15,7 @@
     const std::vector<std::vector<torch::Tensor>>& expected_parameters) {
   ASSERT_EQ(parameters.size(), expected_parameters.size());
 
-  for (size_t i = 0; i < parameters.size(); i++) {
+  for (const auto i : c10::irange(parameters.size())) {
     auto layerParameters = parameters[i];
     auto expectedLayerParameters = expected_parameters[i];
 
@@ -27,7 +28,7 @@
       ASSERT_TRUE(false);
     }
 
-    for (size_t p = 0; p < layerParameters.size(0); p++) {
+    for (const auto p : c10::irange(layerParameters.size(0))) {
       // Always compare using double dtype, regardless of the original dtype of the tensors
       auto tensor = layerParameters[p].to(torch::kFloat64);
       auto expectedTensor = expectedLayerParameters[p].to(torch::kFloat64);
diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp
index 7a57d82..cbdf49d 100644
--- a/test/cpp/api/integration.cpp
+++ b/test/cpp/api/integration.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <test/cpp/api/support.h>
@@ -122,10 +123,12 @@
   torch::Device device(with_cuda ? torch::kCUDA : torch::kCPU);
   model->to(device);
 
-  for (size_t epoch = 0; epoch < number_of_epochs; epoch++) {
+  for (const auto epoch : c10::irange(number_of_epochs)) {
+    (void)epoch; // Suppress unused variable warning
     // NOLINTNEXTLINE(performance-for-range-copy)
     for (torch::data::Example<> batch : *data_loader) {
-      auto data = batch.data.to(device), targets = batch.target.to(device);
+      auto data = batch.data.to(device);
+      auto targets = batch.target.to(device);
       torch::Tensor prediction = forward_op(std::move(data));
       // NOLINTNEXTLINE(performance-move-const-arg)
       torch::Tensor loss = torch::nll_loss(prediction, std::move(targets));
@@ -196,7 +199,7 @@
 
     std::vector<torch::Tensor> policy_loss;
     std::vector<torch::Tensor> value_loss;
-    for (auto i = 0U; i < saved_log_probs.size(); i++) {
+    for (const auto i : c10::irange(0U, saved_log_probs.size())) {
       auto advantage = r_t[i] - saved_values[i].item<float>();
       policy_loss.push_back(-advantage * saved_log_probs[i]);
       value_loss.push_back(
diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp
index d9dd02c..bb46324 100644
--- a/test/cpp/api/module.cpp
+++ b/test/cpp/api/module.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <test/cpp/api/support.h>
@@ -704,7 +705,7 @@
   std::vector<std::shared_ptr<torch::nn::Module>> expected = {
       model.ptr(), model[0], model[1], model[2]};
   ASSERT_EQ(modules.size(), expected.size());
-  for (size_t i = 0; i < expected.size(); ++i) {
+  for (const auto i : c10::irange(expected.size())) {
     // Assert pointer equality.
     ASSERT_EQ(modules[i].get(), expected[i].get());
   }
@@ -717,7 +718,7 @@
   std::vector<std::shared_ptr<torch::nn::Module>> expected = {
       model[0], model[1], model[2]};
   ASSERT_EQ(modules.size(), expected.size());
-  for (size_t i = 0; i < expected.size(); ++i) {
+  for (const auto i : c10::irange(expected.size())) {
     // Assert pointer equality.
     ASSERT_EQ(modules[i].get(), expected[i].get());
   }
@@ -730,7 +731,7 @@
   std::vector<std::shared_ptr<torch::nn::Module>> expected = {
       model.ptr(), model[0], model[1], model[2]};
   ASSERT_EQ(modules.size(), expected.size());
-  for (size_t i = 0; i < expected.size(); ++i) {
+  for (const auto i : c10::irange(expected.size())) {
     // Assert pointer equality.
     ASSERT_EQ(modules[i].key(), i ? std::to_string(i - 1) : std::string());
     ASSERT_EQ(modules[i].value().get(), expected[i].get());
@@ -745,7 +746,7 @@
   std::vector<std::shared_ptr<torch::nn::Module>> expected = {
       model[0], model[1], model[2]};
   ASSERT_EQ(modules.size(), expected.size());
-  for (size_t i = 0; i < expected.size(); ++i) {
+  for (const auto i : c10::irange(expected.size())) {
     // Assert pointer equality.
     ASSERT_EQ(modules[i].key(), std::to_string(i));
     ASSERT_EQ(modules[i].value().get(), expected[i].get());
@@ -758,7 +759,7 @@
   std::vector<std::shared_ptr<torch::nn::Module>> expected = {
       model[0], model[1], model[2]};
   ASSERT_EQ(modules.size(), expected.size());
-  for (size_t i = 0; i < expected.size(); ++i) {
+  for (const auto i : c10::irange(expected.size())) {
     // Assert pointer equality.
     ASSERT_EQ(modules[i].get(), expected[i].get());
   }
@@ -774,7 +775,7 @@
   std::vector<std::shared_ptr<torch::nn::Module>> expected = {
       model[0], model[1], model[2]};
   ASSERT_EQ(modules.size(), expected.size());
-  for (size_t i = 0; i < expected.size(); ++i) {
+  for (const auto i : c10::irange(expected.size())) {
     // Assert pointer equality.
     ASSERT_EQ(modules[i].key(), std::to_string(i));
     ASSERT_EQ(modules[i].value().get(), expected[i].get());
@@ -822,7 +823,7 @@
 struct TestContainer : torch::nn::Module {
   TestContainer(int64_t number, std::vector<TestContainer> modules = {})
       : tensor(torch::tensor(number)) {
-    for (size_t i = 0; i < modules.size(); ++i) {
+    for (const auto i : c10::irange(modules.size())) {
       register_module(
           std::to_string(i),
           std::make_shared<TestContainer>(std::move(modules[i])));
@@ -866,7 +867,7 @@
   std::vector<std::shared_ptr<torch::nn::Module>> modules = model->modules();
 
   ASSERT_EQ(modules.size(), 10);
-  for (size_t i = 0; i < modules.size(); ++i) {
+  for (const auto i : c10::irange(modules.size())) {
     ASSERT_EQ(get_test_container_item(modules[i]), i);
   }
 }
@@ -879,7 +880,7 @@
 
   ASSERT_EQ(modules.size(), expected.size());
 
-  for (size_t i = 0; i < expected.size(); ++i) {
+  for (const auto i : c10::irange(expected.size())) {
     ASSERT_EQ(modules[i].key(), expected[i].first);
     ASSERT_EQ(get_test_container_item(modules[i].value()), expected[i].second);
   }
diff --git a/test/cpp/api/modulelist.cpp b/test/cpp/api/modulelist.cpp
index 98effb9..aa4fd05 100644
--- a/test/cpp/api/modulelist.cpp
+++ b/test/cpp/api/modulelist.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <algorithm>
@@ -118,7 +119,7 @@
   ASSERT_EQ(list->size(), 3);
 
   // returns the correct module for a given index
-  for (size_t i = 0; i < modules.size(); ++i) {
+  for (const auto i : c10::irange(modules.size())) {
     ASSERT_EQ(&list->at<M>(i), modules[i].get());
   }
 
@@ -143,7 +144,7 @@
   ASSERT_EQ(list->size(), 3);
 
   // returns the correct module for a given index
-  for (size_t i = 0; i < modules.size(); ++i) {
+  for (const auto i : c10::irange(modules.size())) {
     ASSERT_EQ(list->ptr(i).get(), modules[i].get());
     ASSERT_EQ(list[i].get(), modules[i].get());
     ASSERT_EQ(list->ptr<M>(i).get(), modules[i].get());
diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp
index 0c40cf1..74cc2d3 100644
--- a/test/cpp/api/modules.cpp
+++ b/test/cpp/api/modules.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <test/cpp/api/support.h>
@@ -1148,7 +1149,7 @@
   s.backward();
   ASSERT_EQ(y.ndimension(), 2);
   ASSERT_EQ(s.ndimension(), 0);
-  for (auto i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     ASSERT_EQ(y.size(i), 2);
   }
 
@@ -1166,7 +1167,7 @@
   s.backward();
   ASSERT_EQ(y.ndimension(), 2);
   ASSERT_EQ(s.ndimension(), 0);
-  for (auto i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     ASSERT_EQ(y.size(i), 2);
   }
 
@@ -2595,7 +2596,7 @@
   auto output = m(input);
   auto sum = torch::sum(torch::exp(input), 1);
 
-  for (int i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     auto expected = torch::exp(input[i]) / sum[i];
     ASSERT_TRUE(torch::allclose(output[i], expected));
   }
@@ -2607,7 +2608,7 @@
   auto output = m(input);
   auto sum = torch::sum(torch::exp(-input), 1);
 
-  for (int i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     auto expected = torch::exp(-input[i]) / sum[i];
     ASSERT_TRUE(torch::allclose(output[i], expected));
   }
@@ -2619,7 +2620,7 @@
   auto output = m(input);
   auto sum = torch::sum(torch::exp(input), 1);
 
-  for (int i = 0; i < 2; i++) {
+  for (const auto i : c10::irange(2)) {
     auto expected = torch::log(torch::exp(input[i]) / sum[i]);
     ASSERT_TRUE(torch::allclose(output[i], expected));
   }
@@ -2656,7 +2657,7 @@
     auto logprob_out = asfm->log_prob(x);
     NLLLoss nll_loss;
 
-    for (int64_t v = 0; v < 4; ++v) {
+    for (const auto v : c10::irange(4)) {
       auto y = torch::full({4}, v, torch::kLong);
       auto asm_out = asfm(x, y);
       auto out = asm_out.output;
@@ -2675,10 +2676,10 @@
   auto output = m(input);
   auto sum = torch::sum(torch::exp(input), 1);
 
-  for (int i = 0; i < 1; i++) {
-    for (int j = 0; j < 2; j++) {
-      for (int k = 0; k < 3; k++) {
-        for (int l = 0; l < 4; l++) {
+  for (const auto i : c10::irange(1)) {
+    for (const auto j : c10::irange(2)) {
+      for (const auto k : c10::irange(3)) {
+        for (const auto l : c10::irange(4)) {
           auto expected = torch::exp(input[i][j][k][l]) / sum[i][k][l];
           ASSERT_TRUE(torch::allclose(output[i][j][k][l], expected));
         }
@@ -3389,8 +3390,8 @@
     TORCH_INTERNAL_ASSERT(a.size(0) == b.size(0));
     TORCH_INTERNAL_ASSERT(a.size(1) == b.size(1));
     auto retval = torch::zeros({a.size(0), a.size(1), a.size(2), b.size(3)}, torch::kFloat32);
-    for (int i = 0; i < a.size(0); i++) {
-      for (int j = 0; j < a.size(1); j++) {
+    for (const auto i : c10::irange(a.size(0))) {
+      for (const auto j : c10::irange(a.size(1))) {
         retval[i][j] = torch::matmul(a[i][j], b[i][j]);
       }
     }
@@ -3399,9 +3400,9 @@
 
   torch::Tensor _softmax(const torch::Tensor& x) {
     auto output = torch::zeros(x.sizes());
-    for (int i = 0; i < x.size(0); i++) {
-      for (int j = 0; j < x.size(1); j++) {
-        for (int k = 0; k < x.size(2); k++) {
+    for (const auto i : c10::irange(x.size(0))) {
+      for (const auto j : c10::irange(x.size(1))) {
+        for (const auto k : c10::irange(x.size(2))) {
           const auto& x_curr = x[i][j][k];
           const auto e_x = torch::exp(x_curr - torch::max(x_curr));
           output[i][j][k] = e_x / torch::sum(e_x);
@@ -3424,10 +3425,10 @@
     const auto s1 = QKT.size(2);
     const auto s2 = QKT.size(3);
     if (unseen_mask.defined() || key_padding_mask.defined()) {
-      for (int i = 0; i < b1; i++) {
-        for (int j = 0; j < b2; j++) {
-          for (int m = 0; m < s1; m++) {
-            for (int n = 0; n < s2; n++) {
+      for (const auto i : c10::irange(b1)) {
+        for (const auto j : c10::irange(b2)) {
+          for (const auto m : c10::irange(s1)) {
+            for (const auto n : c10::irange(s2)) {
               if (unseen_mask.defined() && unseen_mask[m][n].item<double>() == 0) {
                 QKT[i][j][m][n] = -std::numeric_limits<double>::infinity();
               }
@@ -3475,7 +3476,8 @@
     std::uniform_int_distribution<int> d_2_10(2, 10);
     std::uniform_int_distribution<int> d_3_10(3, 10);
     bool registration_checked = false;
-    for (int i = 0; i < 100; i++) {
+    for (const auto i : c10::irange(100)) {
+      (void)i; // Suppress unused variable warning
       const auto batch_sz = d_2_10(generator);
       const auto seq_len = d_2_10(generator);
       const auto d_head = d_3_10(generator);
diff --git a/test/cpp/api/nn_utils.cpp b/test/cpp/api/nn_utils.cpp
index 37c2676..451c72e 100644
--- a/test/cpp/api/nn_utils.cpp
+++ b/test/cpp/api/nn_utils.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include <c10/util/irange.h>
 #include <torch/torch.h>
 
 #include <test/cpp/api/support.h>
@@ -40,7 +41,7 @@
   auto compare_scaling =
       [&](const std::vector<torch::Tensor>& grads) -> torch::Tensor {
     std::vector<torch::Tensor> p_scale;
-    for (int i = 0; i < grads.size(); i++) {
+    for (const auto i : c10::irange(grads.size())) {
       auto param = l->parameters()[i];
       auto grad = grads[i];
       p_scale.push_back(param.grad().data().div(grad).view(-1));
@@ -61,7 +62,7 @@
       std::numeric_limits<float>::infinity(),
   };
   for (auto norm_type : norm_types) {
-    for (int i = 0; i < grads.size(); i++) {
+    for (const auto i : c10::irange(grads.size())) {
       l->parameters()[i].mutable_grad() =
           grads[i].clone().view_as(l->parameters()[i].data());
     }
@@ -80,7 +81,7 @@
       torch::ones(10).div(500),
   };
   for (auto norm_type : norm_types) {
-    for (int i = 0; i < grads.size(); i++) {
+    for (const auto i : c10::irange(grads.size())) {
       l->parameters()[i].grad().data().copy_(grads[i]);
     }
     auto norm_before = compute_norm(norm_type);
@@ -227,7 +228,7 @@
       // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
       EXPECT_THROW(utils::clip_grad_norm_(parameters, 1., norm_type, true), std::exception) << msg;
       // Grads should not change if error is thrown
-      for (int64_t p_idx = 0; p_idx < parameters.size(); p_idx++) {
+      for (const auto p_idx : c10::irange(parameters.size())) {
         ASSERT_TRUE(torch::allclose(parameters[p_idx].grad(), grads_before[p_idx], 1.0, 0.0, /*equal_nan*/ true)) << msg;
       }
     } else {
@@ -285,7 +286,7 @@
   std::vector<std::vector<torch::Tensor>> grad_lists = {
       {grad_w, grad_b}, {grad_w, torch::Tensor()}};
   for (auto grad_list : grad_lists) {
-    for (int i = 0; i < grad_list.size(); i++) {
+    for (const auto i : c10::irange(grad_list.size())) {
       auto p = l->parameters()[i];
       auto g = grad_list[i];
       p.mutable_grad() = g.defined() ? g.clone().view_as(p.data()) : g;
@@ -335,7 +336,7 @@
   };
 
   utils::vector_to_parameters(vector, zero_parameters);
-  for (int i = 0; i < zero_parameters.size(); ++i) {
+  for (const auto i : c10::irange(zero_parameters.size())) {
     ASSERT_TRUE(zero_parameters[i].allclose(parameters[i]));
   }
 
@@ -368,7 +369,8 @@
 std::vector<torch::Tensor> PackedSequenceTest_ordered_sequence(torch::ScalarType tensor_type) {
   std::vector<torch::Tensor> seqs;
   seqs.reserve(PackedSequenceTest_batch_size);
-  for (int64_t i = 0; i < PackedSequenceTest_batch_size; i++) {
+  for (const auto i : c10::irange(PackedSequenceTest_batch_size)) {
+    (void)i; // Suppress unused variable warning
     seqs.emplace_back(torch::empty({
       torch::randint(1, PackedSequenceTest_max_length, {1}).item<int64_t>()
     }, tensor_type));
@@ -390,7 +392,7 @@
   // Create Tensor of random padded sequences
   auto ordered = PackedSequenceTest_ordered_sequence(tensor_type);
   auto lengths = torch::empty({(int64_t)ordered.size()}, torch::kInt64);
-  for (int64_t i = 0; i < ordered.size(); i++) {
+  for (const auto i : c10::irange(ordered.size())) {
     lengths[i] = ordered[i].size(0);
   }
   auto padded_tensor = rnn_utils::pad_sequence(ordered);
@@ -619,9 +621,9 @@
     }
     auto padded = torch::cat(tensors_to_be_cat, 1);
     std::vector<torch::Tensor> expected_data_vec;
-    for (int64_t n = 0; n < batch_sizes.size(0); n++) {
+    for (const auto n : c10::irange(batch_sizes.size(0))) {
       int64_t batch_size = batch_sizes[n].item<int64_t>();
-      for (int64_t i = 0; i < batch_size; i++) {
+      for (const auto i : c10::irange(batch_size)) {
         expected_data_vec.emplace_back(torch::arange(1., 6) + (i + 1) * 100 + 5 * n);
       }
     }
@@ -631,7 +633,7 @@
     if (should_shuffle) {
       // Shuffle the padded sequence to create an unsorted sequence
       std::vector<int64_t> permutation;
-      for (int64_t i = 0; i < sorted_lengths.size(); i++) {
+      for (const auto i : c10::irange(sorted_lengths.size())) {
         permutation.emplace_back(i);
       }
       std::shuffle(
@@ -702,7 +704,7 @@
       if (batch_first) {
         grad_output.transpose_(0, 1);
       }
-      for (int64_t i = 0; i < lengths.size(0); i++) {
+      for (const auto i : c10::irange(lengths.size(0))) {
         int64_t l = lengths[i].item<int64_t>();
         ASSERT_TRUE(torch::allclose(
           padded.grad().narrow(0, 0, l).select(1, i),