gh-106529: Split FOR_ITER_RANGE into uops (#106638)

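FOR_ITER_RANGE becomes a macro built from the uops _ITER_CHECK_RANGE,
_ITER_JUMP_RANGE and _ITER_NEXT_RANGE. The Tier 2 translator replaces
_ITER_JUMP_RANGE with _ITER_EXHAUSTED_RANGE + _POP_JUMP_IF_TRUE, which
branches to a stub that exits the trace past END_FOR.

For an example of what this does for Tier 1 and Tier 2, see
https://github.com/python/cpython/issues/106529#issuecomment-1631649920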
diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h
index 317f42a..d2c1f9ad 100644
--- a/Include/internal/pycore_opcode_metadata.h
+++ b/Include/internal/pycore_opcode_metadata.h
@@ -40,6 +40,9 @@
 #define _GUARD_GLOBALS_VERSION 318
 #define _GUARD_BUILTINS_VERSION 319
 #define IS_NONE 320
+#define _ITER_CHECK_RANGE 321
+#define _ITER_EXHAUSTED_RANGE 322
+#define _ITER_NEXT_RANGE 323
 
 #ifndef NEED_OPCODE_METADATA
 extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump);
@@ -1319,5 +1322,8 @@
     [318] = "_GUARD_GLOBALS_VERSION",
     [319] = "_GUARD_BUILTINS_VERSION",
     [320] = "IS_NONE",
+    [321] = "_ITER_CHECK_RANGE",
+    [322] = "_ITER_EXHAUSTED_RANGE",
+    [323] = "_ITER_NEXT_RANGE",
 };
 #endif // NEED_OPCODE_METADATA
diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py
index 9c14a50..abdf7ed 100644
--- a/Lib/test/test_capi/test_misc.py
+++ b/Lib/test/test_capi/test_misc.py
@@ -2443,7 +2443,6 @@ def testfunc(x):
                 i += 1
 
         opt = _testinternalcapi.get_uop_optimizer()
-
         with temporary_optimizer(opt):
             testfunc(1000)
 
@@ -2580,13 +2579,33 @@ def testfunc(n):
 
         ex = get_first_executor(testfunc)
         self.assertIsNotNone(ex)
-        # for i, (opname, oparg) in enumerate(ex):
-        #     print(f"{i:4d}: {opname:<20s} {oparg:4d}")
         uops = {opname for opname, _ in ex}
         # Since there is no JUMP_FORWARD instruction,
         # look for indirect evidence: the += operator
         self.assertIn("_BINARY_OP_ADD_INT", uops)
 
+    def test_for_iter_range(self):
+        def testfunc(n):
+            total = 0
+            for i in range(n):
+                total += i
+            return total
+        # Debug aid: import dis; dis.dis(testfunc)
+
+        opt = _testinternalcapi.get_uop_optimizer()
+        with temporary_optimizer(opt):
+            total = testfunc(10)
+            self.assertEqual(total, 45)
+
+        ex = get_first_executor(testfunc)
+        self.assertIsNotNone(ex)
+        # for i, (opname, oparg) in enumerate(ex):
+        #     print(f"{i:4d}: {opname:<20s} {oparg:3d}")
+        uops = {opname for opname, _ in ex}
+        self.assertIn("_ITER_EXHAUSTED_RANGE", uops)
+        # This only proves the Tier 2 uop was emitted; that its exit jump
+        # goes past END_FOR is verified by manual inspection of the output
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Python/bytecodes.c b/Python/bytecodes.c
index f5ce2e7..18862f8 100644
--- a/Python/bytecodes.c
+++ b/Python/bytecodes.c
@@ -2451,9 +2451,14 @@ dummy_func(
             // Common case: no jump, leave it to the code generator
         }
 
-        inst(FOR_ITER_RANGE, (unused/1, iter -- iter, next)) {
+        op(_ITER_CHECK_RANGE, (iter -- iter)) {
             _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
             DEOPT_IF(Py_TYPE(r) != &PyRangeIter_Type, FOR_ITER);
+        }
+
+        op(_ITER_JUMP_RANGE, (iter -- iter)) {
+            _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+            assert(Py_TYPE(r) == &PyRangeIter_Type);
             STAT_INC(FOR_ITER, hit);
             if (r->len <= 0) {
                 STACK_SHRINK(1);
@@ -2463,15 +2468,29 @@ dummy_func(
                 JUMPBY(oparg + 1);
                 DISPATCH();
             }
+        }
+
+        // Only used by Tier 2: pushes Py_True if r->len <= 0, else Py_False
+        op(_ITER_EXHAUSTED_RANGE, (iter -- iter, exhausted)) {
+            _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+            assert(Py_TYPE(r) == &PyRangeIter_Type);
+            exhausted = r->len <= 0 ? Py_True : Py_False;
+        }
+
+        op(_ITER_NEXT_RANGE, (iter -- iter, next)) {
+            _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+            assert(Py_TYPE(r) == &PyRangeIter_Type);
+            assert(r->len > 0);
             long value = r->start;
             r->start = value + r->step;
             r->len--;
             next = PyLong_FromLong(value);
-            if (next == NULL) {
-                goto error;
-            }
+            ERROR_IF(next == NULL, error);
         }
 
+        macro(FOR_ITER_RANGE) =
+            unused/1 + _ITER_CHECK_RANGE + _ITER_JUMP_RANGE + _ITER_NEXT_RANGE;
+
         inst(FOR_ITER_GEN, (unused/1, iter -- iter, unused)) {
             DEOPT_IF(tstate->interp->eval_frame, FOR_ITER);
             PyGenObject *gen = (PyGenObject *)iter;
diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
index 1df8feb..2c2dbf4 100644
--- a/Python/executor_cases.c.h
+++ b/Python/executor_cases.c.h
@@ -1720,6 +1720,40 @@
             break;
         }
 
+        case _ITER_CHECK_RANGE: {
+            PyObject *iter = stack_pointer[-1];
+            _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+            DEOPT_IF(Py_TYPE(r) != &PyRangeIter_Type, FOR_ITER);
+            break;
+        }
+
+        case _ITER_EXHAUSTED_RANGE: {
+            PyObject *iter = stack_pointer[-1];
+            PyObject *exhausted;
+            _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+            assert(Py_TYPE(r) == &PyRangeIter_Type);
+            exhausted = r->len <= 0 ? Py_True : Py_False;
+            STACK_GROW(1);
+            stack_pointer[-1] = exhausted;
+            break;
+        }
+
+        case _ITER_NEXT_RANGE: {
+            PyObject *iter = stack_pointer[-1];
+            PyObject *next;
+            _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+            assert(Py_TYPE(r) == &PyRangeIter_Type);
+            assert(r->len > 0);
+            long value = r->start;
+            r->start = value + r->step;
+            r->len--;
+            next = PyLong_FromLong(value);
+            if (next == NULL) goto error;
+            STACK_GROW(1);
+            stack_pointer[-1] = next;
+            break;
+        }
+
         case WITH_EXCEPT_START: {
             PyObject *val = stack_pointer[-1];
             PyObject *lasti = stack_pointer[-3];
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h
index f7a18b4..383432f 100644
--- a/Python/generated_cases.c.h
+++ b/Python/generated_cases.c.h
@@ -3092,29 +3092,47 @@
         }
 
         TARGET(FOR_ITER_RANGE) {
-            PyObject *iter = stack_pointer[-1];
-            PyObject *next;
-            _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
-            DEOPT_IF(Py_TYPE(r) != &PyRangeIter_Type, FOR_ITER);
-            STAT_INC(FOR_ITER, hit);
-            if (r->len <= 0) {
-                STACK_SHRINK(1);
-                Py_DECREF(r);
-                SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER);
-                // Jump over END_FOR instruction.
-                JUMPBY(oparg + 1);
-                DISPATCH();
+            PyObject *_tmp_1;
+            PyObject *_tmp_2 = stack_pointer[-1];
+            {
+                PyObject *iter = _tmp_2;
+                _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+                DEOPT_IF(Py_TYPE(r) != &PyRangeIter_Type, FOR_ITER);
+                _tmp_2 = iter;
             }
-            long value = r->start;
-            r->start = value + r->step;
-            r->len--;
-            next = PyLong_FromLong(value);
-            if (next == NULL) {
-                goto error;
+            {
+                PyObject *iter = _tmp_2;
+                _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+                assert(Py_TYPE(r) == &PyRangeIter_Type);
+                STAT_INC(FOR_ITER, hit);
+                if (r->len <= 0) {
+                    STACK_SHRINK(1);
+                    Py_DECREF(r);
+                    SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER);
+                    // Jump over END_FOR instruction.
+                    JUMPBY(oparg + 1);
+                    DISPATCH();
+                }
+                _tmp_2 = iter;
             }
-            STACK_GROW(1);
-            stack_pointer[-1] = next;
+            {
+                PyObject *iter = _tmp_2;
+                PyObject *next;
+                _PyRangeIterObject *r = (_PyRangeIterObject *)iter;
+                assert(Py_TYPE(r) == &PyRangeIter_Type);
+                assert(r->len > 0);
+                long value = r->start;
+                r->start = value + r->step;
+                r->len--;
+                next = PyLong_FromLong(value);
+                if (next == NULL) goto error;
+                _tmp_2 = iter;
+                _tmp_1 = next;
+            }
             next_instr += 1;
+            STACK_GROW(1);
+            stack_pointer[-1] = _tmp_1;
+            stack_pointer[-2] = _tmp_2;
             DISPATCH();
         }
 
diff --git a/Python/optimizer.c b/Python/optimizer.c
index c3fdee6..abd2351 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -479,6 +479,28 @@ translate_bytecode_to_trace(
                 break;
             }
 
+            case FOR_ITER_RANGE:
+            {
+                // Assume the loop-exit jump is unlikely, so the exit goes in a stub
+                // Reserve 9 entries (4 here, 3 stub, plus SAVE_IP + EXIT_TRACE)
+                if (trace_length + 9 > max_length) {
+                    DPRINTF(1, "Ran out of space for FOR_ITER_RANGE\n");
+                    goto done;
+                }
+                _Py_CODEUNIT *target_instr =  // +1 at the end skips over END_FOR
+                    instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + oparg + 1;
+                max_length -= 3;  // max_length now marks where this stub starts
+                ADD_TO_TRACE(_ITER_CHECK_RANGE, 0);
+                ADD_TO_TRACE(_ITER_EXHAUSTED_RANGE, 0);
+                ADD_TO_TRACE(_POP_JUMP_IF_TRUE, max_length);
+                ADD_TO_TRACE(_ITER_NEXT_RANGE, 0);
+
+                ADD_TO_STUB(max_length + 0, POP_TOP, 0);
+                ADD_TO_STUB(max_length + 1, SAVE_IP, INSTR_IP(target_instr, code));
+                ADD_TO_STUB(max_length + 2, EXIT_TRACE, 0);
+                break;
+            }
+
             default:
             {
                 const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
@@ -574,8 +596,8 @@ translate_bytecode_to_trace(
                     }
                 }
             }
-            trace_length += buffer_size - max_length;
         }
+        trace_length += buffer_size - max_length;
         return trace_length;
     }
     else {