torch/csrc/dynamo/eval_frame.c - platform/external/pytorch - Git at Google

 #define PY_SSIZE_T_CLEAN
 #include <torch/csrc/dynamo/cache_entry.h>
 #include <torch/csrc/dynamo/cpp_shim.h>
 #include <torch/csrc/dynamo/cpython_defs.h>
 #include <torch/csrc/dynamo/cpython_includes.h>
 #include <torch/csrc/dynamo/debug_macros.h>
 #include <torch/csrc/dynamo/extra_state.h>
 #include <torch/csrc/dynamo/framelocals_mapping.h>
 #include <torch/csrc/utils/python_compat.h>
 #include <opcode.h>
 #include <stdbool.h>

 // Optional Python object stored by set_guard_error_hook(); NULL when unset.
 PyObject* guard_error_hook = NULL;
 // Label passed to _pytorch_record_function_enter() around cache lookups.
 const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup";

 // Counter maintained by increment_working_threads()/decrement_working_threads().
 // The eval-frame shim is installed while it is positive and the previous
 // eval function is restored when it drops back to zero.
 static int active_dynamo_threads = 0;

 // Thread-specific-storage key holding the current eval-frame callback
 // (accessed via eval_frame_callback_get()/eval_frame_callback_set()).
 static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT;

 // Fetch this thread's eval-frame callback from thread-specific storage.
 // A slot that reads back as NULL is reported as Py_None. The returned
 // pointer is not INCREF'd here.
 inline static PyObject* eval_frame_callback_get(void) {
   PyObject* stored = (PyObject*)PyThread_tss_get(&eval_frame_callback_key);
   return unlikely(stored == NULL) ? (PyObject*)Py_None : stored;
 }

 // Store obj as this thread's eval-frame callback. No reference counting is
 // performed here; the caller is responsible for the reference the slot holds.
 inline static void eval_frame_callback_set(PyObject* obj) {
   void* slot_value = obj;
   PyThread_tss_set(&eval_frame_callback_key, slot_value);
 }

 // 3.14 Not supported at all. See cpython_defs.c for hints
 #if !(IS_PYTHON_3_14_PLUS)

 // All the eval APIs change in 3.11 so we need to decide which one to use on the fly
 // https://docs.python.org/3/c-api/init.html#c._PyFrameEvalFunction
 #if IS_PYTHON_3_11_PLUS
 #define THP_EVAL_API_FRAME_OBJECT _PyInterpreterFrame

 // We need to be able to return the _PyInterpreterFrame to python so create
 // a python binding for it

 // Python-visible wrapper object around a CPython _PyInterpreterFrame.
 typedef struct THPPyInterpreterFrame {
   PyObject_HEAD
   _PyInterpreterFrame* frame; // Borrowed reference
   // Locals object exposed through the f_locals getter. THPPyInterpreterFrame_New
   // initializes it to NULL; call_callback() assigns it without an INCREF.
   PyObject* locals;
 } THPPyInterpreterFrame;

 // Forward declaration: the getters below need it before the type object
 // (and the definition of this function) appear.
 THPPyInterpreterFrame* THPPyInterpreterFrame_New(_PyInterpreterFrame* frame);

 // Defines a getter named THPPyInterpreterFrame_<name> that reads field <name>
 // of the wrapped frame, casts it to PyObject*, and returns it with its
 // reference count bumped. Py_XINCREF is used, so a NULL field is returned
 // as NULL.
 #define DECLARE_PYOBJ_ATTR(name) \
 static PyObject* THPPyInterpreterFrame_##name(THPPyInterpreterFrame* self, PyObject* _noargs) { \
   PyObject* res = (PyObject*)self->frame->name; \
   Py_XINCREF(res); \
   return res; \
 }

 // The function-object field is read as f_funcobj on 3.12+ and f_func before;
 // both are exposed to Python as "f_func" in the property table.
 #if IS_PYTHON_3_12_PLUS
 DECLARE_PYOBJ_ATTR(f_funcobj)
 #else
 DECLARE_PYOBJ_ATTR(f_func)
 #endif

 DECLARE_PYOBJ_ATTR(f_globals)
 DECLARE_PYOBJ_ATTR(f_builtins)

 // Getter for `f_locals`: returns a new reference to the locals object attached
 // to this wrapper (the wrapper's `locals` field, not a field of the frame).
 static PyObject* THPPyInterpreterFrame_f_locals(THPPyInterpreterFrame* self, PyObject* _noargs) {
   DEBUG_NULL_CHECK(self->locals);
   PyObject* frame_locals = self->locals;
   Py_XINCREF(frame_locals);
   return frame_locals;
 }

 // The code-object field is read as f_executable on 3.13+ and f_code before;
 // both are exposed to Python as "f_code" in the property table.
 #if IS_PYTHON_3_13_PLUS
 DECLARE_PYOBJ_ATTR(f_executable)
 #else
 DECLARE_PYOBJ_ATTR(f_code)
 #endif

 DECLARE_PYOBJ_ATTR(frame_obj)

 // The generator macro is only needed for the getters above.
 #undef DECLARE_PYOBJ_ATTR

 // Getter for `previous`: wraps the frame's `previous` pointer in a newly
 // allocated THPPyInterpreterFrame. Returns NULL if the allocation fails.
 // NOTE(review): the `previous` pointer is wrapped without a NULL check —
 // confirm callers never walk past the outermost frame.
 static THPPyInterpreterFrame* THPPyInterpreterFrame_previous(THPPyInterpreterFrame* self, PyObject* _noargs) {
   _PyInterpreterFrame* prev_frame = self->frame->previous;
   return THPPyInterpreterFrame_New(prev_frame);
 }

 // This is not a true attribute of the class but we do access it in python and it is hard to implement
 // on the python side, so do it here:
 static PyObject* THPPyInterpreterFrame_f_lasti(THPPyInterpreterFrame* self, PyObject* _noargs) {
   // Value of _PyInterpreterFrame_LASTI for the wrapped frame, boxed as a
   // Python int.
   long lasti = _PyInterpreterFrame_LASTI(self->frame);
   return PyLong_FromLong(lasti);
 }

 // Getter for `f_lineno`. When the interpreter frame has no frame object, the
 // code object's co_firstlineno is reported. Otherwise the frame object's
 // line number is returned, or None if PyFrame_GetLineNumber yields a
 // negative value.
 static PyObject* THPPyInterpreterFrame_f_lineno(THPPyInterpreterFrame* self, PyObject* _noargs) {
   PyFrameObject* frame_obj = self->frame->frame_obj;
   if (frame_obj == NULL) {
     return PyLong_FromLong(F_CODE(self->frame)->co_firstlineno);
   }
   int lineno = PyFrame_GetLineNumber(frame_obj);
   if (lineno >= 0) {
     return PyLong_FromLong(lineno);
   }
   Py_RETURN_NONE;
 }

 // Getter for `f_back`: returns a new reference to the outer frame object of
 // the wrapped frame, or None when the wrapped frame has no frame object or
 // no outer frame. Returns NULL only when an exception is set.
 static PyObject* THPPyInterpreterFrame_f_back(THPPyInterpreterFrame* self, PyObject* _noargs) {
   if (!self->frame->frame_obj) {
     Py_RETURN_NONE;
   }
   PyObject* back = (PyObject*)PyFrame_GetBack(self->frame->frame_obj);
   if (back == NULL && !PyErr_Occurred()) {
     // PyFrame_GetBack returns NULL without setting an exception when there is
     // no outer frame. A getter that returns NULL with no exception set makes
     // the interpreter raise SystemError, so report None instead.
     Py_RETURN_NONE;
   }
   return back;
 }

 // Read-only attribute table for the _PyInterpreterFrame wrapper type (every
 // entry has a getter and a NULL setter). The Python-facing names "f_func" and
 // "f_code" stay the same on every version; only the getter bound to them
 // changes with the Python version. The table ends with a {NULL} sentinel.
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays)
 static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {
 #if IS_PYTHON_3_12_PLUS
     {"f_func", (getter)THPPyInterpreterFrame_f_funcobj, NULL, NULL, NULL},
 #else
     {"f_func", (getter)THPPyInterpreterFrame_f_func, NULL, NULL, NULL},
 #endif
     {"f_globals", (getter)THPPyInterpreterFrame_f_globals, NULL, NULL, NULL},
     {"f_builtins", (getter)THPPyInterpreterFrame_f_builtins, NULL, NULL, NULL},
     {"f_locals", (getter)THPPyInterpreterFrame_f_locals, NULL, NULL, NULL},
 #if IS_PYTHON_3_13_PLUS
     {"f_code", (getter)THPPyInterpreterFrame_f_executable, NULL, NULL, NULL},
 #else
     {"f_code", (getter)THPPyInterpreterFrame_f_code, NULL, NULL, NULL},
 #endif
     {"frame_obj", (getter)THPPyInterpreterFrame_frame_obj, NULL, NULL, NULL},
     {"previous", (getter)THPPyInterpreterFrame_previous, NULL, NULL, NULL},
     {"f_lasti", (getter)THPPyInterpreterFrame_f_lasti, NULL, NULL, NULL},
     {"f_lineno", (getter)THPPyInterpreterFrame_f_lineno, NULL, NULL, NULL},
     {"f_back", (getter)THPPyInterpreterFrame_f_back, NULL, NULL, NULL},
     {NULL}};

 // Type object for the frame wrapper. Only the name, instance size, default
 // flags and the getset table are filled in; every other slot is left zeroed
 // here. The type is finalized with PyType_Ready() in
 // torch_c_dynamo_eval_frame_init().
 static PyTypeObject THPPyInterpreterFrameType = {
     PyVarObject_HEAD_INIT(NULL, 0)
     .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame",
     .tp_basicsize = sizeof(THPPyInterpreterFrame),
     .tp_flags = Py_TPFLAGS_DEFAULT,
     .tp_getset = THPPyInterpreterFrame_properties,
 };


 // Allocate a wrapper for `frame` through the type's tp_alloc slot. The frame
 // pointer is stored as-is (no ownership is taken) and `locals` starts out
 // NULL. Returns NULL if allocation fails.
 THPPyInterpreterFrame* THPPyInterpreterFrame_New(_PyInterpreterFrame* frame) {
   PyTypeObject* wrapper_type = (PyTypeObject*)&THPPyInterpreterFrameType;
   THPPyInterpreterFrame* wrapper =
       (THPPyInterpreterFrame*)wrapper_type->tp_alloc(wrapper_type, 0);
   if (wrapper != NULL) {
     wrapper->frame = frame;
     wrapper->locals = NULL;
   }
   return wrapper;
 }


 #else
 #define THP_EVAL_API_FRAME_OBJECT PyFrameObject

 // Pre-3.11 variant: forwards straight to PyFrame_FastToLocalsWithError and
 // returns its result. free_vars_copied is accepted so the call site can use
 // one signature, but it is never written here.
 static int
 THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_vars_copied) {
   (void)free_vars_copied;
   int status = PyFrame_FastToLocalsWithError(frame);
   return status;
 }
 #endif

 // Forward declarations for the two layers of the custom frame evaluator;
 // both are defined further down in this file.
 static PyObject* _custom_eval_frame_shim(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag);
 static PyObject* _custom_eval_frame(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag,
     PyObject* callback,
     int* should_clear_frame);
 // Eval-frame function that was installed on the interpreter before
 // enable_eval_frame_shim() replaced it; NULL while the shim is not installed.
 // eval_frame_default() dispatches to it when it is non-NULL.
 static PyObject *(*previous_eval_frame)(PyThreadState *tstate,
                                         THP_EVAL_API_FRAME_OBJECT* frame, int throw_flag) = NULL;

 // Function installed as the interpreter's frame evaluator. From 3.9 on the
 // evaluator receives the thread state as an argument; on older versions it
 // does not, so the thread state is fetched before delegating to
 // _custom_eval_frame_shim.
 #if PY_VERSION_HEX >= 0x03090000
 static PyObject* custom_eval_frame_shim(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag) {
   PyObject* outcome = _custom_eval_frame_shim(tstate, frame, throw_flag);
   return outcome;
 }
 #else
 static PyObject* custom_eval_frame_shim(THP_EVAL_API_FRAME_OBJECT* frame, int throw_flag) {
   return _custom_eval_frame_shim(PyThreadState_GET(), frame, throw_flag);
 }
 #endif

 // Evaluate `frame` without any Dynamo processing. On 3.9+ a NULL tstate is
 // replaced by the current thread state, and the evaluator recorded in
 // previous_eval_frame is used when there is one; otherwise (and always on
 // older versions) CPython's _PyEval_EvalFrameDefault runs the frame.
 inline static PyObject* eval_frame_default(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag) {
 #if PY_VERSION_HEX >= 0x03090000
   if (tstate == NULL) {
     tstate = PyThreadState_GET();
   }
   if (previous_eval_frame == NULL) {
     return _PyEval_EvalFrameDefault(tstate, frame, throw_flag);
   }
   return previous_eval_frame(tstate, frame, throw_flag);
 #else
   return _PyEval_EvalFrameDefault(frame, throw_flag);
 #endif
 }

 // Install custom_eval_frame_shim as the interpreter's frame evaluator if it
 // is not installed already. On 3.9+ the evaluator being replaced is saved in
 // previous_eval_frame so it can be restored later.
 inline static void enable_eval_frame_shim(PyThreadState* tstate) {
 #if PY_VERSION_HEX >= 0x03090000
   if (_PyInterpreterState_GetEvalFrameFunc(tstate->interp) ==
       &custom_eval_frame_shim) {
     // Already installed; nothing to do.
     return;
   }
   DEBUG_CHECK(previous_eval_frame == NULL);
   previous_eval_frame = _PyInterpreterState_GetEvalFrameFunc(tstate->interp);
   _PyInterpreterState_SetEvalFrameFunc(tstate->interp,
                                        &custom_eval_frame_shim);
 #else
   if (tstate->interp->eval_frame == &custom_eval_frame_shim) {
     return;
   }
   // First call
   tstate->interp->eval_frame = &custom_eval_frame_shim;
 #endif
 }

 // Undo enable_eval_frame_shim. On 3.9+ the evaluator saved in
 // previous_eval_frame is reinstalled and the saved pointer is cleared; on
 // older versions CPython's default evaluator is reinstalled. Nothing happens
 // when the interpreter already uses the target evaluator.
 inline static void enable_eval_frame_default(PyThreadState* tstate) {
 #if PY_VERSION_HEX >= 0x03090000
   if (_PyInterpreterState_GetEvalFrameFunc(tstate->interp) ==
       previous_eval_frame) {
     return;
   }
   DEBUG_CHECK(previous_eval_frame != NULL);
   _PyInterpreterState_SetEvalFrameFunc(tstate->interp,
                                        previous_eval_frame);
   previous_eval_frame = NULL;
 #else
   if (tstate->interp->eval_frame == &_PyEval_EvalFrameDefault) {
     return;
   }
   // First call
   tstate->interp->eval_frame = &_PyEval_EvalFrameDefault;
 #endif
 }


 // Return the frame's code-object name (co_name) as a UTF-8 C string, as
 // produced by PyUnicode_AsUTF8.
 inline static const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) {
   DEBUG_CHECK(PyUnicode_Check(F_CODE(frame)->co_name));
   PyObject* name_obj = F_CODE(frame)->co_name;
   return PyUnicode_AsUTF8(name_obj);
 }

 // Invoke the Dynamo callback as callable(frame, cache_entry, frame_state).
 //
 // On 3.11+ the raw interpreter frame is wrapped in a THPPyInterpreterFrame
 // whose `locals` field is pointed at `locals` (no INCREF); on older versions
 // the frame object itself is passed. Returns whatever the callable returns,
 // or NULL if the wrapper cannot be allocated.
 static inline PyObject* call_callback(
     PyObject* callable,
     THP_EVAL_API_FRAME_OBJECT* _frame,
     PyObject* locals,
     CacheEntry* cache_entry,
     FrameState* frame_state) {

 // remember to update the type signature for DynamoCallbackFn.__call__ in torch/_dynamo/types.py
 // if this function changes
 #if IS_PYTHON_3_11_PLUS
   THPPyInterpreterFrame* frame_arg = THPPyInterpreterFrame_New(_frame);
   if (frame_arg == NULL) {
     return NULL;
   }
   frame_arg->locals = locals;
 #else
   PyObject* frame_arg = Py_NewRef(_frame);
 #endif

   PyObject* entry_obj = CacheEntry_to_obj(cache_entry);
   PyObject* outcome =
       PyObject_CallFunction(callable, "OOO", frame_arg, entry_obj, frame_state);

   // Release the temporaries created above; the call result is handed to the
   // caller untouched.
   Py_DECREF(frame_arg);
   Py_DECREF(entry_obj);
   return outcome;
 }

 // On 3.12+, clears `frame` and then pops it from the thread state; compiles
 // to a no-op on older versions. Called by _custom_eval_frame_shim when
 // _custom_eval_frame reports that the original frame must be cleared.
 static inline void clear_old_frame_if_python_312_plus(
   PyThreadState* tstate,
   THP_EVAL_API_FRAME_OBJECT* frame) {
 #if IS_PYTHON_3_12_PLUS

   THP_PyFrame_Clear(frame);
   THP_PyThreadState_PopFrame(tstate, frame);

 #endif
 }

 // Run `code` in place of the code object that `frame` was created for.
 //
 // A "shadow" frame is built for `code`, the arguments, cell variables and
 // free variables of `frame` are copied into it by index (each copied slot is
 // XINCREF'd), and the shadow frame is evaluated with eval_frame_default.
 //
 // tstate, frame, code: must be non-NULL (checked in debug builds).
 // throw_flag: forwarded unchanged to eval_frame_default.
 // free_vars_copied: on 3.11+, when non-zero and the first instruction of
 //   `code` is COPY_FREE_VARS, the shadow frame is positioned so that this
 //   instruction is not executed again. Unused on older versions.
 //
 // Returns the result of evaluating the shadow frame, or NULL if the function
 // copy or the shadow frame could not be created.
 inline static PyObject* eval_custom_code_impl(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     PyCodeObject* code,
     int throw_flag,
     int free_vars_copied) {

   DEBUG_NULL_CHECK(tstate);
   DEBUG_NULL_CHECK(frame);
   DEBUG_NULL_CHECK(code);

 #if IS_PYTHON_3_11_PLUS

   // Generate Python function object and _PyInterpreterFrame in a way similar to
   // https://github.com/python/cpython/blob/e715da6db1d1d70cd779dc48e1ba8110c51cc1bf/Python/ceval.c#L1130
 #if IS_PYTHON_3_12_PLUS
   PyFunctionObject* old_func = (PyFunctionObject*) frame->f_funcobj;
   size_t size = code->co_framesize;
 #else
   PyFunctionObject* old_func = frame->f_func;
   size_t size = code->co_nlocalsplus + code->co_stacksize + FRAME_SPECIALS_SIZE;
 #endif

   PyFunctionObject* func = _PyFunction_CopyWithNewCode(old_func, code);
   if (func == NULL) {
     return NULL;
   }

   THP_EVAL_API_FRAME_OBJECT* shadow = THP_PyThreadState_BumpFramePointerSlow(tstate, size);
   if (shadow == NULL) {
     Py_DECREF(func);
     return NULL;
   }

   // Take an extra reference for the frame initializer below; the reference
   // obtained from _PyFunction_CopyWithNewCode is released after evaluation.
   Py_INCREF(func);
   // consumes reference to func
 #if IS_PYTHON_3_12_PLUS
   _PyFrame_Initialize(shadow, func, NULL, code, 0);
 #else
   _PyFrame_InitializeSpecials(shadow, func, NULL, code->co_nlocalsplus);
 #endif

   PyObject** fastlocals_old = frame->localsplus;
   PyObject** fastlocals_new = shadow->localsplus;
   Py_ssize_t n_old = F_CODE(frame)->co_nlocalsplus;
   Py_ssize_t n_new = code->co_nlocalsplus;

   // localsplus are XINCREF'd by default eval frame, so all values must be valid.
 #if !(IS_PYTHON_3_12_PLUS)
   // _PyFrame_Initialize in 3.12 already does this
   for (int i = 0; i < code->co_nlocalsplus; i++) {
     fastlocals_new[i] = NULL;
   }
 #endif

   // for 3.11+, if free_vars_copied is true, we do not need to
   // run the first COPY_FREE_VARS since THP_PyFrame_FastToLocalsWithError
   // already did the equivalent action.
   if (free_vars_copied && _Py_OPCODE(_PyCode_CODE(F_CODE(shadow))[0]) == COPY_FREE_VARS) {
     PREV_INSTR(shadow) = _PyCode_CODE(F_CODE(shadow));
   }

 #else

   THP_EVAL_API_FRAME_OBJECT* shadow = PyFrame_New(tstate, code, frame->f_globals, NULL);
   if (shadow == NULL) {
     return NULL;
   }

   PyObject** fastlocals_old = frame->f_localsplus;
   PyObject** fastlocals_new = shadow->f_localsplus;
   Py_ssize_t n_old = F_CODE(frame)->co_nlocals + PyCode_GetNFreevars(F_CODE(frame)) + PyCode_GetNCellvars(F_CODE(frame));
   Py_ssize_t n_new = code->co_nlocals + PyCode_GetNFreevars(code) + PyCode_GetNCellvars(code);

 #endif

   // ============== Initialize new frame from old frame ============
   // Python internals for executing a function:
   //  1. CPython interpreter first creates an empty frame according to the code object
   //  2. CPython interpreter initializes the frame by filling arguments/free variables into frame and initializing cell variables
   //  3. CPython interpreter executes the code object
   //
   // Dynamo hooks the 3rd step: before executing the code object, Dynamo transforms the code object into a new code object. Then, the old frame is not suitable for executing the new code. Therefore, Dynamo needs to manually create and initialize a new frame to execute the new code.
   // The main task is to copy data from the old frame to the new frame, concerning a storage space named `localsplus`.
   //
   // localsplus storage is an array with the following layout:
   // |   args   |   new_locals    |    cell_variables |   free_variables    |
   // | <--- from left to right, index from 0 to n - 1 ---> |
   // code.co_varnames == args + new_locals, code.co_nlocals == len(code.co_varnames)
   // code.co_freevars == free_variables
   // In Python 3.10 and lower, `n == code.co_nlocals + len(code.co_cellvars) + len(code.co_freevars)` (Python expression)
   // In Python 3.11 and higher, `n <= code.co_nlocals + len(code.co_cellvars) + len(code.co_freevars)` (Python expression). There is an extra field in Python C-API: `n == code->co_nlocalsplus` (C expression) to retrieve the length of array.
   // The complexity happens if an argument becomes a cell variable:
   //  In Python 3.10 and lower, `code.co_cellvars == cell_variables`, and the corresponding slot in args becomes `NULL`.
   //  In Python 3.11 and higher, `code.co_cellvars > cell_variables`, that cell variable is still stored in args, with a flag set in the corresponding item's `co_localspluskinds`.
   //
   // ideally, we need to look up new localsplus from old localsplus by name:
   // for i, name, value in enumerate(localsplusnames_old):
   //   if value != NULL: (NULL happens for new local variables and arguments that become cell variables)
   //     name_to_idx[name] = i
   // for i, name in enumerate(localsplusnames_new):
   //  if name in name_to_idx:
   //    fastlocals_new[i] = fastlocals_old[name_to_idx[name]]
   //
   // The above process of building a `name_to_idx` mapping is expensive.
   // Dynamo makes the following assumptions:
   //  1. new code has the same arguments as the old code (both the number and the order)
   //  2. new code has the same cell variables as the old code (both the number and the order)
   //  3. new code has the same free variables as the old code (both the number and the order)
   //  The only flexibility lies in new local variables: new code can introduce its own variables.
   // With these assumptions, Dynamo can copy data directly by index. Dynamo just needs to take care of copying cell variables correctly.
   // To avoid runtime cost, the assumptions are checked when we first generate the code object in pytorch/torch/_dynamo/convert_frame.py .


   // copy args
   // according to https://docs.python.org/3/library/inspect.html , `co_argcount` is the number of arguments (not including keyword only arguments, * or ** args). so we need to add `co_kwonlyargcount` and `co_flags` to get the total number of arguments.
   // !!(F_CODE(frame)->co_flags & CO_VARARGS) is 1 if the function has *args, 0 otherwise
   // !!(F_CODE(frame)->co_flags & CO_VARKEYWORDS) is 1 if the function has **kwargs, 0 otherwise
   // they convert bit flags to 0 or 1, and avoid branching.
   // This is performance critical code, so we really care about performance.
   Py_ssize_t total_argcount_old = F_CODE(frame)->co_argcount + F_CODE(frame)->co_kwonlyargcount + !!(F_CODE(frame)->co_flags & CO_VARARGS) + !!(F_CODE(frame)->co_flags & CO_VARKEYWORDS);

   for (Py_ssize_t i = 0; i < total_argcount_old; i++) {
     Py_XINCREF(fastlocals_old[i]);
     fastlocals_new[i] = fastlocals_old[i];
   }

   // copy free vars
   // Free variables sit at the tail of both arrays, so they are matched up
   // by counting back from the end of each.
   Py_ssize_t nfrees_old = PyCode_GetNFreevars(F_CODE(frame));

   for (Py_ssize_t i = 0; i < nfrees_old; i++) {
     Py_XINCREF(fastlocals_old[n_old - 1 - i]);
     fastlocals_new[n_new - 1 - i] = fastlocals_old[n_old - 1 - i];
   }

   // copy cell vars, from high index to low index, until it meets a variable that is not a cell variable.
   for (Py_ssize_t i = n_old - nfrees_old - 1, j = n_new - nfrees_old - 1; i >= total_argcount_old; i--, j--) {

   // conditional test to tell if a variable is not a cell variable
   // this is straightforward in Python 3.11 and higher, as there are bit flags in `co_localspluskinds` to tell if a variable is a cell variable.
   // in Python 3.10 and lower, essentially we are checking if a variable is a new local variable (because of the layout mentioned above, the first variable that is not a cell variable is the first new local variable). the corresponding slot in `f_localsplus` is NULL for new local variables.
 #if IS_PYTHON_3_11_PLUS
     if(!(_PyLocals_GetKind(F_CODE(frame)->co_localspluskinds, i) & CO_FAST_CELL))
     {
       break;
     }
 #else
     if(fastlocals_old[i] == NULL)
     {
       break;
     }
 #endif

     Py_XINCREF(fastlocals_old[i]);
     fastlocals_new[j] = fastlocals_old[i];
   }

   // NOTE: if you want to evaluate frame instead of shadow in 3.12+,
   // you need to clear_old_frame_if_python_312_plus the shadow frame BEFORE
   // calling eval_frame_default (i.e. here) and comment out the
   // clear_old_frame_if_python_312_plus call on the original frame.

   PyObject* result = eval_frame_default(tstate, shadow, throw_flag);

 #if IS_PYTHON_3_12_PLUS

   // frame is cleared by caller
   Py_DECREF(func);

 #elif IS_PYTHON_3_11_PLUS

   // In 3.11, shadow has is_entry set to true, so _PyEvalFrameClearAndPop is not called,
   // so we manually clear and pop the shadow frame.
   THP_PyFrame_Clear(shadow);
   THP_PyThreadState_PopFrame(tstate, shadow);
   Py_DECREF(func);

 #else

   // Pre-3.11 the shadow frame is an ordinary object created by PyFrame_New.
   Py_DECREF(shadow);

 #endif

   return result;
 }

 // Thin wrapper around eval_custom_code_impl that brackets the call with a
 // "Torch-Compiled Region" profiler event. Arguments and return value are
 // passed through unchanged.
 inline static PyObject* eval_custom_code(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     PyCodeObject* code,
     int throw_flag,
     int free_vars_copied) {
   _PytorchRecordFunctionState* profiler_scope =
       _pytorch_record_function_enter("Torch-Compiled Region");
   PyObject* outcome =
       eval_custom_code_impl(tstate, frame, code, throw_flag, free_vars_copied);
   _pytorch_record_function_exit(profiler_scope);
   return outcome;
 }

 // Dispatches on the current thread's callback, which is one of:
 //  - None: TorchDynamo is disabled, the frame is evaluated normally
 //  - False: run-only mode (reuse existing compiles)
 //  - Python callable(): TorchDynamo is enabled
 // The latter two are handled by _custom_eval_frame. If it asks for the
 // original frame to be cleared, that is done here after it returns.
 static PyObject* _custom_eval_frame_shim(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag) {
   PyObject* callback = eval_frame_callback_get();

   if (callback != Py_None) {
     int must_clear_frame = 0;
     PyObject* outcome = _custom_eval_frame(
         tstate, frame, throw_flag, callback, &must_clear_frame);
     if (must_clear_frame != 0) {
       clear_old_frame_if_python_312_plus(tstate, frame);
     }
     return outcome;
   }

   return eval_frame_default(tstate, frame, throw_flag);
 }

 // NOTE: In 3.12+, the frame evaluation function (callee) is responsible for clearing/popping
 // the frame, meaning that unless we default evaluate the original frame,
 // we are responsible for clearing it - via clear_old_frame_if_python_312_plus.
 // The should_clear_frame flag is used to indicate whether the frame should be
 // cleared by _custom_eval_frame's caller.
 // Generally should_clear_frame should be set if and only if we don't eval_frame_default.
 // Core of the Dynamo frame hook. Decides, for one frame, whether to run it
 // unmodified, run previously compiled code for it, or ask `callback` to
 // compile it.
 //
 // tstate, frame, throw_flag: as received from the interpreter.
 // callback: Py_False for run-only mode, otherwise the Python callable to
 //   invoke on a cache miss (Py_None is filtered out by the caller).
 // should_clear_frame: out-parameter; set to 1 on every path that does not
 //   hand `frame` to eval_frame_default, left untouched otherwise.
 //
 // Returns the frame's result, or NULL with a Python error pending.
 static PyObject* _custom_eval_frame(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag,
     PyObject* callback,
     int* should_clear_frame) {
 #if IS_PYTHON_3_11_PLUS
   DEBUG_TRACE(
       "begin %s %s %i %i",
       get_frame_name(frame),
       PyUnicode_AsUTF8(F_CODE(frame)->co_filename),
       F_CODE(frame)->co_firstlineno,
       _PyInterpreterFrame_LASTI(frame));
 #else
   DEBUG_TRACE(
       "begin %s %s %i %i %i",
       get_frame_name(frame),
       PyUnicode_AsUTF8(F_CODE(frame)->co_filename),
       frame->f_lineno,
       frame->f_lasti,
       frame->f_iblock);
 #endif

   if (throw_flag) {
     // When unwinding generators, eval frame is called with throw_flag ==
     // true.  Frame evaluation is supposed to continue unwinding by propagating
     // the exception.  Dynamo doesn't really know how to do this, nor does it
     // really want to do this, because there's unlikely any code to capture
     // (you're going to immediately quit out of the frame, perhaps running
     // some unwinding logic along the way).  So we just run the default
     // handler in this case.
     //
     // NB: A previous version of this patch returned NULL.  This is wrong,
     // because returning NULL is *different* from unwinding an exception.
     // In particular, you will not execute things like context manager
     // __exit__ if you just return NULL.
     //
     // NB: It's /conceivable/ that you might want to actually still call the
     // Dynamo callback when throw_flag == TRUE, to give Dynamo a chance to
     // do any stack unwinding code.  But this is not really useful because
     // (1) Dynamo doesn't actually know how to do stack unwinding, so it would
     // immediately skip the frame, and (2) even if it did, this would only
     // be profitable if there was tensor code in the unwinding code.  Seems
     // unlikely.
     DEBUG_TRACE("throw %s", get_frame_name(frame));
     return eval_frame_default(tstate, frame, throw_flag);
   }

   // Per-code-object state. Frames whose code is marked SKIP_CODE, and frames
   // with no state at all in run-only mode, are evaluated normally.
   ExtraState* extra = get_extra_state(F_CODE(frame));
   if (extra == SKIP_CODE || (callback == Py_False && extra == NULL)) {
     DEBUG_TRACE("skip %s", get_frame_name(frame));
     return eval_frame_default(tstate, frame, throw_flag);
   }

   if (extra == NULL) {
     extra = init_and_set_extra_state(F_CODE(frame));
   }


   // Obtain the locals object used for the cache lookup. `locals` is released
   // with Py_DECREF on every path below.
   int free_vars_copied = 0;
   #if IS_PYTHON_3_12_PLUS
   PyObject *locals = get_framelocals_mapping(frame);
   #else
   if (THP_PyFrame_FastToLocalsWithError(frame, &free_vars_copied) < 0) {
     DEBUG_TRACE("error %s", get_frame_name(frame));
     *should_clear_frame = 1;
     return NULL;
   }
   PyObject *locals = frame->f_locals;
   Py_INCREF(locals);
   #endif

   PyObject* backend = get_backend(callback);

   // A callback of Py_False indicates "run only" mode, the cache is checked, but
   // we never compile.
   if (callback == Py_False) {
     DEBUG_TRACE("In run only mode %s", get_frame_name(frame));
     _PytorchRecordFunctionState* rf = _pytorch_record_function_enter(cache_lookup_profiler_str);
     PyObject* maybe_cached_code = lookup(extra, locals, backend);
     _pytorch_record_function_exit(rf);

     Py_DECREF(locals);

     if (maybe_cached_code == NULL) {
       // guard eval failed, keep propagating
       *should_clear_frame = 1;
       return NULL;
     } else if (maybe_cached_code == Py_None) {
       DEBUG_TRACE("cache miss %s", get_frame_name(frame));
       return eval_frame_default(tstate, frame, throw_flag);
     }
     PyCodeObject* cached_code = (PyCodeObject*)maybe_cached_code;
     // used cached version
     DEBUG_TRACE("cache hit %s", get_frame_name(frame));
     *should_clear_frame = 1;
     // NOTE(review): this path passes 0 instead of free_vars_copied, unlike the
     // cache-hit path below -- confirm this is intentional.
     return eval_custom_code(tstate, frame, cached_code, throw_flag, 0);
   }
   DEBUG_CHECK(PyDict_CheckExact(locals));
   DEBUG_CHECK(PyDict_CheckExact(frame->f_globals));
   DEBUG_CHECK(PyDict_CheckExact(frame->f_builtins));

   // We don't run the current custom_eval_frame behavior for guards.
   // So we temporarily set the callback to Py_None to drive the correct behavior
   // in the shim.
   eval_frame_callback_set(Py_None);

   _PytorchRecordFunctionState* rf = _pytorch_record_function_enter(cache_lookup_profiler_str);
   PyObject* maybe_cached_code = lookup(extra, locals, backend);
   _pytorch_record_function_exit(rf);
   if (maybe_cached_code == NULL) {
     // Python error
     // Note that the callback is left as Py_None on this path.
     *should_clear_frame = 1;
     Py_DECREF(locals);
     return NULL;
   } else if (maybe_cached_code != Py_None) {
     PyCodeObject* cached_code = (PyCodeObject*)maybe_cached_code;
     // used cached version
     DEBUG_TRACE("cache hit %s", get_frame_name(frame));
     // Re-enable custom behavior
     eval_frame_callback_set(callback);
     *should_clear_frame = 1;
     Py_DECREF(locals);
     return eval_custom_code(tstate, frame, cached_code, throw_flag, free_vars_copied);
   }
   // cache miss
   CacheEntry* cache_entry = extract_cache_entry(extra);
   FrameState* frame_state = extract_frame_state(extra);
   PyObject* result =
       call_callback(callback, frame, locals, cache_entry, frame_state);
   Py_DECREF(locals);
   if (result == NULL) {
     // internal exception, returning here will leak the exception into user code
     // this is useful for debugging -- but we don't want it to happen outside of
     // testing
     // NB: we intentionally DO NOT re-enable custom behavior to prevent
     // cascading failure from internal exceptions.  The upshot is if
     // Dynamo barfs, that's it for Dynamo, even if you catch the exception
     // inside the torch.compile block we won't try to Dynamo anything else.
     *should_clear_frame = 1;
     return NULL;
   } else if (result != Py_None) {
     DEBUG_TRACE("create cache %s", get_frame_name(frame));

     // NB: We could use extract_cache_entry to get the cache_entry, but
     // extract_cache_entry returns a borrowed reference. Modifying a borrowed
     // reference seems wrong. Therefore, we directly access the
     // extra->cache_entry. extra won't be NULL here.
     CacheEntry* new_cache_entry = create_cache_entry(extra, result, backend);
     Py_DECREF(result);

     // Update the existing cache_entry on the extra object. This extra object is
     // sitting on the extra scratch space, we are just changing the cache_entry
     // ptr. As a result, extra now becomes the owner of CacheEntry object. This
     // will be cleaned up when set_extra_state is called.
     // Re-enable custom behavior
     eval_frame_callback_set(callback);
     *should_clear_frame = 1;
     return eval_custom_code(tstate, frame, CacheEntry_get_code(new_cache_entry), throw_flag, free_vars_copied);
   } else {
     // The callback returned None: mark this code object as skipped so later
     // calls take the early SKIP_CODE exit, then run the frame normally.
     DEBUG_TRACE("create skip %s", get_frame_name(frame));
     Py_DECREF(result);
     set_extra_state(F_CODE(frame), SKIP_CODE);
     // Re-enable custom behavior
     eval_frame_callback_set(callback);
     return eval_frame_default(tstate, frame, throw_flag);
   }
 }

 #else // IS_PYTHON_3_14_PLUS

 // Fake definitions for everything we removed

 // Minimal stand-in for the wrapper struct so the rest of the file still
 // compiles on 3.14+; it carries no `locals` field.
 typedef struct THPPyInterpreterFrame {
   PyObject_HEAD
   _PyInterpreterFrame* frame; // Borrowed reference
 } THPPyInterpreterFrame;

 // No-op replacements: the eval-frame shim is never installed on 3.14+.
 inline static void enable_eval_frame_shim(PyThreadState* tstate) {}
 inline static void enable_eval_frame_default(PyThreadState* tstate) {}

 // Empty attribute table (sentinel entry only).
 static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL};

 // Same type object as in the supported-version branch, but backed by the
 // empty attribute table above.
 static PyTypeObject THPPyInterpreterFrameType = {
     PyVarObject_HEAD_INIT(NULL, 0)
     .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame",
     .tp_basicsize = sizeof(THPPyInterpreterFrame),
     .tp_flags = Py_TPFLAGS_DEFAULT,
     .tp_getset = THPPyInterpreterFrame_properties,
 };

 #endif // CPython 3.14

 // Bump the active-thread counter and make sure the eval-frame shim is
 // installed while the counter is positive. Returns a new reference to None.
 static PyObject* increment_working_threads(PyThreadState* tstate) {
   active_dynamo_threads += 1;
   if (active_dynamo_threads > 0) {
     enable_eval_frame_shim(tstate);
   }
   Py_RETURN_NONE;
 }

 // Lower the active-thread counter (never below zero). When it reaches zero
 // the previous frame evaluator is restored. Returns a new reference to None.
 static PyObject* decrement_working_threads(PyThreadState* tstate) {
   if (active_dynamo_threads <= 0) {
     Py_RETURN_NONE;
   }
   active_dynamo_threads -= 1;
   if (active_dynamo_threads == 0) {
     enable_eval_frame_default(tstate);
   }
   Py_RETURN_NONE;
 }

 // Change the eval frame callback and return the old one
 //  - None: disables TorchDynamo
 //  - False: run-only mode (reuse existing compiles)
 //  - Python callable(): enables TorchDynamo
 //
 // Switching between None and non-None adjusts the active-thread counter,
 // which installs or removes the eval-frame shim. The caller receives a
 // reference to the previous callback; the thread-local slot takes a new
 // reference to new_callback.
 static PyObject* set_eval_frame(PyObject* new_callback, PyThreadState* tstate) {
   PyObject* old_callback = eval_frame_callback_get();

   // owned by caller
   Py_INCREF(old_callback);

   // increment/decrement_working_threads return a new reference to None;
   // release it instead of dropping it on the floor.
   PyObject* toggle_result = NULL;
   if (old_callback != Py_None && new_callback == Py_None) {
     toggle_result = decrement_working_threads(tstate);
   } else if (old_callback == Py_None && new_callback != Py_None) {
     toggle_result = increment_working_threads(tstate);
   }
   Py_XDECREF(toggle_result);

   // Take the slot's reference to the new callback and release the reference
   // the slot held on the old one.
   Py_INCREF(new_callback);
   Py_DECREF(old_callback);

   // Set thread local callback. This will drive behavior of our shim, if/when it
   // is installed.
   eval_frame_callback_set(new_callback);

   return old_callback;
 }

 // Python entry point for set_eval_frame. Accepts None, False or any callable;
 // anything else raises TypeError. Returns the previously installed callback.
 static PyObject* set_eval_frame_py(PyObject* dummy, PyObject* callback) {
   int is_sentinel = (callback == Py_None) || (callback == Py_False);
   if (!is_sentinel && !PyCallable_Check(callback)) {
     DEBUG_TRACE0("arg error");
     PyErr_SetString(PyExc_TypeError, "expected a callable");
     return NULL;
   }
   DEBUG_TRACE(
       "python enabled=%d and is run_only=%d",
       callback != Py_None,
       callback == Py_False);
   PyThreadState* tstate = PyThreadState_GET();
   return set_eval_frame(callback, tstate);
 }

 // Python entry point: clears the extra state attached to a code object.
 // Raises TypeError if the argument is not a code object.
 static PyObject* reset_code(PyObject* dummy, PyObject* code) {
   if (PyCode_Check(code)) {
     // set_extra_state destroys the existing object on extra scratch space.
     set_extra_state((PyCodeObject*)code, NULL);
     Py_RETURN_NONE;
   }

   DEBUG_TRACE0("arg error");
   PyErr_SetString(PyExc_TypeError, "expected a code object");
   return NULL;
 }

 // a dummy C function used in testing: takes two positional arguments and
 // returns a new reference to the second one.
 static PyObject* unsupported(PyObject* dummy, PyObject* args) {
   PyObject* first = NULL;
   PyObject* second = NULL;
   int parsed = PyArg_ParseTuple(args, "OO", &first, &second);
   if (!parsed) {
     return NULL;
   }
   Py_INCREF(second);
   return second;
 }

 // Python entry point: marks a code object with SKIP_CODE so the frame hook
 // evaluates its frames normally. Raises TypeError if the argument is not a
 // code object.
 static PyObject* skip_code(PyObject* dummy, PyObject* obj) {
   if (PyCode_Check(obj)) {
     // set_extra_state destroys the existing object on extra scratch space.
     set_extra_state((PyCodeObject*)obj, SKIP_CODE);
     Py_RETURN_NONE;
   }

   PyErr_SetString(PyExc_TypeError, "expected a code object");
   return NULL;
 }

 // Python entry point: replaces the global guard_error_hook. Passing None
 // clears the hook (it is stored as NULL); any other object is stored with a
 // new reference. The previous hook's reference is released.
 static PyObject* set_guard_error_hook(PyObject* dummy, PyObject* obj) {
   PyObject* new_hook = (obj == Py_None) ? NULL : obj;
   Py_XSETREF(guard_error_hook, Py_XNewRef(new_hook));
   Py_RETURN_NONE;
 }

 // Functions exported by the module. All take a single object (METH_O)
 // except `unsupported`, which takes a positional-argument tuple. The table
 // ends with an all-NULL sentinel.
 static PyMethodDef _methods[] = {
     {"set_eval_frame", set_eval_frame_py, METH_O, NULL},
     {"reset_code", reset_code, METH_O, NULL},
     {"unsupported", unsupported, METH_VARARGS, NULL},
     {"skip_code", skip_code, METH_O, NULL},
     {"set_guard_error_hook", set_guard_error_hook, METH_O, NULL},
     {NULL, NULL, 0, NULL}};

 // Module definition: name, docstring, m_size of -1, and the method table
 // above.
 static struct PyModuleDef _module = {
     PyModuleDef_HEAD_INIT,
     "torch._C._dynamo.eval_frame",
     "Module containing hooks to override eval_frame",
     -1,
     _methods};

 // From 3.12 the code-extra index request goes through the PyUnstable_ name;
 // alias the old spelling so the call below works on every version.
 #if IS_PYTHON_3_12_PLUS
 #define _PyEval_RequestCodeExtraIndex PyUnstable_Eval_RequestCodeExtraIndex
 #endif

 // Create and return the torch._C._dynamo.eval_frame module.
 //
 // Registers the code-object extra index (with destroy_extra_state as its
 // destructor), creates the thread-specific-storage key for the callback and
 // seeds it with None, and on 3.11+ readies the _PyInterpreterFrame wrapper
 // type and adds it to the module.
 //
 // Returns a new reference to the module, or NULL with a Python error set.
 // On failure after the module has been created, the module (and the extra
 // reference taken on the type) is released so nothing leaks.
 PyObject* torch_c_dynamo_eval_frame_init(void) {
   extra_index = _PyEval_RequestCodeExtraIndex(destroy_extra_state);
   if (extra_index < 0) {
     PyErr_SetString(PyExc_RuntimeError,
                     "dynamo: unable to register extra index");
     return NULL;
   }

   int result = PyThread_tss_create(&eval_frame_callback_key);
   CHECK(result == 0);

   // The thread-local slot holds its own reference to the callback.
   Py_INCREF(Py_None);
   eval_frame_callback_set(Py_None);

   PyObject* module = PyModule_Create(&_module);
   if (module == NULL) {
     return NULL;
   }

 #if IS_PYTHON_3_11_PLUS
   if (PyType_Ready(&THPPyInterpreterFrameType) < 0) {
     Py_DECREF(module);
     return NULL;
   }
   Py_INCREF(&THPPyInterpreterFrameType);
   if (PyModule_AddObject(module, "_PyInterpreterFrame", (PyObject*)&THPPyInterpreterFrameType) != 0) {
     // PyModule_AddObject only steals the reference on success.
     Py_DECREF(&THPPyInterpreterFrameType);
     Py_DECREF(module);
     return NULL;
   }
 #endif

   return module;
 }