
#include "Python.h"
#include "pycore_code.h"
#include "opcode.h"


/* We lay out the quickened data as a bi-directional array:
 * Instructions upwards, cache entries downwards.
 * first_instr is aligned to a SpecializedCacheEntry.
 * The nth instruction is located at first_instr[n]
 * The nth cache is located at ((SpecializedCacheEntry *)first_instr)[-1-n]
 * The first (index 0) cache entry is reserved for the count, to enable finding
 * the first instruction from the base pointer.
 * The cache_count argument must include space for the count.
 * We use the SpecializedCacheOrInstruction union to refer to the data
 * to avoid type punning.

Layout of quickened data, each line 8 bytes for M cache entries and N instructions:

<cache_count>                               <---- co->co_quickened
<cache M-1>
<cache M-2>
...
<cache 0>
<instr 0> <instr 1> <instr 2> <instr 3>     <---- co->co_firstinstr
<instr 4> <instr 5> <instr 6> <instr 7>
...
<instr N-1>
*/
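
/* Purely as an illustration of the layout above (a sketch, not code used in
 * this file): given a first_instr aligned as described, the two views index
 * the same allocation in opposite directions.
 *
 *     _Py_CODEUNIT *instr_n = &first_instr[n];               // nth instruction
 *     SpecializedCacheEntry *cache_n =
 *         &((SpecializedCacheEntry *)first_instr)[-1 - n];   // nth cache entry
 *
 * The SpecializedCacheOrInstruction union mentioned above is what keeps this
 * aliasing well defined.
 */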

Py_ssize_t _Py_QuickenedCount = 0;

static SpecializedCacheOrInstruction *
allocate(int cache_count, int instruction_count)
{
    assert(sizeof(SpecializedCacheOrInstruction) == 2*sizeof(int32_t));
    assert(sizeof(SpecializedCacheEntry) == 2*sizeof(int32_t));
    assert(cache_count > 0);
    assert(instruction_count > 0);
    int count = cache_count + (instruction_count + INSTRUCTIONS_PER_ENTRY - 1)/INSTRUCTIONS_PER_ENTRY;
    SpecializedCacheOrInstruction *array = (SpecializedCacheOrInstruction *)
        PyMem_Malloc(sizeof(SpecializedCacheOrInstruction) * count);
    if (array == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
    _Py_QuickenedCount++;
    array[0].entry.zero.cache_count = cache_count;
    return array;
}
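
/* A worked example of the size computation above, assuming
 * INSTRUCTIONS_PER_ENTRY is 4 (four 2-byte code units per 8-byte entry, as
 * the layout diagram suggests): quickening 10 instructions that need 5 cache
 * entries (including the reserved count entry) allocates
 *
 *     count = 5 + (10 + 4 - 1)/4 = 5 + 3 = 8 entries, i.e. 64 bytes.
 */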

static int
get_cache_count(SpecializedCacheOrInstruction *quickened) {
    return quickened[0].entry.zero.cache_count;
}

/* Map from opcode to adaptive opcode.
   Values of zero are ignored. */
static uint8_t adaptive_opcodes[256] = { 0 };

/* The number of cache entries required for a "family" of instructions. */
static uint8_t cache_requirements[256] = { 0 };
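
/* For illustration only (hypothetical names and cache size; the actual
 * families are registered elsewhere): an instruction family would be enabled
 * by filling in both tables, e.g.
 *
 *     adaptive_opcodes[LOAD_ATTR] = LOAD_ATTR_ADAPTIVE;
 *     cache_requirements[LOAD_ATTR] = 2;
 *
 * so that optimize() below rewrites LOAD_ATTR to its adaptive form and
 * reserves two cache entries for it.
 */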

/* Return the oparg for the cache_offset and instruction index.
 *
 * If no cache is needed then return the original oparg.
 * If a cache is needed, but cannot be accessed because
 * oparg would be too large, then return -1.
 *
 * Also updates the cache_offset, as it may need to be incremented by
 * more than the cache requirements, if many instructions do not need caches.
 *
 * See pycore_code.h for details of how the cache offset,
 * instruction index and oparg are related */
static int
oparg_from_instruction_and_update_offset(int index, int opcode, int original_oparg, int *cache_offset) {
    /* The instruction pointer in the interpreter points to the next
     * instruction, so we compute the offset using nexti (index + 1) */
    int nexti = index + 1;
    uint8_t need = cache_requirements[opcode];
    if (need == 0) {
        return original_oparg;
    }
    assert(adaptive_opcodes[opcode] != 0);
    int oparg = oparg_from_offset_and_nexti(*cache_offset, nexti);
    assert(*cache_offset == offset_from_oparg_and_nexti(oparg, nexti));
    /* Some cache space is wasted here as the minimum possible offset is (nexti>>1) */
    if (oparg < 0) {
        oparg = 0;
        *cache_offset = offset_from_oparg_and_nexti(oparg, nexti);
    }
    else if (oparg > 255) {
        return -1;
    }
    *cache_offset += need;
    return oparg;
}
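
/* A worked example of the clamping above (a sketch based only on the
 * invariants stated in this file, assuming oparg 0 maps to the minimum
 * offset of nexti>>1): if the instruction at index 99 (nexti == 100) is the
 * first one that needs a cache, *cache_offset is still 0, so the computed
 * oparg is negative. It is clamped to 0 and *cache_offset jumps forward to
 * offset_from_oparg_and_nexti(0, 100), i.e. 50, wasting the entries that
 * this instruction can no longer reach. Conversely, an instruction whose
 * cache would need an oparg above 255 simply gets no cache (-1). */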

static int
entries_needed(_Py_CODEUNIT *code, int len)
{
    int cache_offset = 0;
    int previous_opcode = -1;
    for (int i = 0; i < len; i++) {
        uint8_t opcode = _Py_OPCODE(code[i]);
        if (previous_opcode != EXTENDED_ARG) {
            oparg_from_instruction_and_update_offset(i, opcode, 0, &cache_offset);
        }
        previous_opcode = opcode;
    }
    return cache_offset + 1;   // One extra for the count entry
}

static inline _Py_CODEUNIT *
first_instruction(SpecializedCacheOrInstruction *quickened)
{
    return &quickened[get_cache_count(quickened)].code[0];
}

/** Insert adaptive instructions and superinstructions.
 *
 * Skip instructions preceded by EXTENDED_ARG for adaptive
 * instructions as those are both very rare and tricky
 * to handle.
 */
static void
optimize(SpecializedCacheOrInstruction *quickened, int len)
{
    _Py_CODEUNIT *instructions = first_instruction(quickened);
    int cache_offset = 0;
    int previous_opcode = -1;
    for (int i = 0; i < len; i++) {
        int opcode = _Py_OPCODE(instructions[i]);
        int oparg = _Py_OPARG(instructions[i]);
        uint8_t adaptive_opcode = adaptive_opcodes[opcode];
        if (adaptive_opcode && previous_opcode != EXTENDED_ARG) {
            int new_oparg = oparg_from_instruction_and_update_offset(
                i, opcode, oparg, &cache_offset
            );
            if (new_oparg < 0) {
                /* Not possible to allocate a cache for this instruction */
                previous_opcode = opcode;
                continue;
            }
            instructions[i] = _Py_MAKECODEUNIT(adaptive_opcode, new_oparg);
            previous_opcode = adaptive_opcode;
            int entries_needed = cache_requirements[opcode];
            if (entries_needed) {
                /* Initialize the adaptive cache entry */
                int cache0_offset = cache_offset - entries_needed;
                SpecializedCacheEntry *cache =
                    _GetSpecializedCacheEntry(instructions, cache0_offset);
                cache->adaptive.original_oparg = oparg;
                cache->adaptive.counter = 0;
            }
        }
        else {
            /* Super instructions don't use the cache,
             * so no need to update the offset. */
            switch (opcode) {
                /* Insert superinstructions here
                 E.g.
                case LOAD_FAST:
                    if (previous_opcode == LOAD_FAST)
                        instructions[i-1] = _Py_MAKECODEUNIT(LOAD_FAST__LOAD_FAST, oparg);
                 */
            }
            previous_opcode = opcode;
        }
    }
    assert(cache_offset + 1 == get_cache_count(quickened));
}

int
_Py_Quicken(PyCodeObject *code) {
    if (code->co_quickened) {
        return 0;
    }
    Py_ssize_t size = PyBytes_GET_SIZE(code->co_code);
    int instr_count = (int)(size/sizeof(_Py_CODEUNIT));
    if (instr_count > MAX_SIZE_TO_QUICKEN) {
        code->co_warmup = QUICKENING_WARMUP_COLDEST;
        return 0;
    }
    int entry_count = entries_needed(code->co_firstinstr, instr_count);
    SpecializedCacheOrInstruction *quickened = allocate(entry_count, instr_count);
    if (quickened == NULL) {
        return -1;
    }
    _Py_CODEUNIT *new_instructions = first_instruction(quickened);
    memcpy(new_instructions, code->co_firstinstr, size);
    optimize(quickened, instr_count);
    code->co_quickened = quickened;
    code->co_firstinstr = new_instructions;
    return 0;
}
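
/* For illustration only (hypothetical call site; the real trigger lives in
 * the interpreter, not in this file): a caller would quicken a code object
 * once it has warmed up, propagating the memory error from allocate() as -1.
 *
 *     if (code_object_is_warmed_up(code)) {   // assumed warm-up check
 *         if (_Py_Quicken(code) < 0) {
 *             return NULL;
 *         }
 *     }
 */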