JIT compiler update: rename SLJIT_INT_REGISTER to SLJIT_GP_REGISTER, gate the x86 SIMD fast paths on SLJIT_HAS_FPU instead of the retired SLJIT_HAS_SSE2, harden the Apple executable allocator, and extend sljit with f64-as-f32-pair support, SIMD extend/sign operations, void argument-list macros, and an SLJIT_HAS_ATOMIC feature flag.
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
index cf3e6e4..538e7b0 100644
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@@ -2272,7 +2272,7 @@
 for (i = 0; i < RECURSE_TMP_REG_COUNT; i++)
   {
   SLJIT_ASSERT(status->tmp_regs[i] >= 0);
-  SLJIT_ASSERT(sljit_get_register_index(SLJIT_INT_REGISTER, status->saved_tmp_regs[i]) < 0 || status->tmp_regs[i] == status->saved_tmp_regs[i]);
+  SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, status->saved_tmp_regs[i]) < 0 || status->tmp_regs[i] == status->saved_tmp_regs[i]);
 
   status->store_bases[i] = -1;
   }
@@ -2292,7 +2292,7 @@
 if (status->store_bases[next_tmp_reg] == -1)
   {
   /* Preserve virtual registers. */
-  if (sljit_get_register_index(SLJIT_INT_REGISTER, status->saved_tmp_regs[next_tmp_reg]) < 0)
+  if (sljit_get_register_index(SLJIT_GP_REGISTER, status->saved_tmp_regs[next_tmp_reg]) < 0)
     OP1(SLJIT_MOV, status->saved_tmp_regs[next_tmp_reg], 0, tmp_reg, 0);
   }
 else
@@ -2321,7 +2321,7 @@
     OP1(SLJIT_MOV, SLJIT_MEM1(status->store_bases[next_tmp_reg]), status->store_offsets[next_tmp_reg], tmp_reg, 0);
 
     /* Restore virtual registers. */
-    if (sljit_get_register_index(SLJIT_INT_REGISTER, saved_tmp_reg) < 0)
+    if (sljit_get_register_index(SLJIT_GP_REGISTER, saved_tmp_reg) < 0)
       OP1(SLJIT_MOV, tmp_reg, 0, saved_tmp_reg, 0);
     }
 
@@ -3250,7 +3250,7 @@
   return;
   }
 
-if (sljit_get_register_index(SLJIT_INT_REGISTER, TMP3) >= 0 && !sljit_has_cpu_feature(SLJIT_HAS_ZERO_REGISTER))
+if (sljit_get_register_index(SLJIT_GP_REGISTER, TMP3) >= 0 && !sljit_has_cpu_feature(SLJIT_HAS_ZERO_REGISTER))
   {
   OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0);
   src = TMP3;
@@ -13739,9 +13739,9 @@
 SLJIT_ASSERT(tables);
 
 #if HAS_VIRTUAL_REGISTERS == 1
-SLJIT_ASSERT(sljit_get_register_index(SLJIT_INT_REGISTER, TMP3) < 0 && sljit_get_register_index(SLJIT_INT_REGISTER, ARGUMENTS) < 0 && sljit_get_register_index(SLJIT_INT_REGISTER, RETURN_ADDR) < 0);
+SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, TMP3) < 0 && sljit_get_register_index(SLJIT_GP_REGISTER, ARGUMENTS) < 0 && sljit_get_register_index(SLJIT_GP_REGISTER, RETURN_ADDR) < 0);
 #elif HAS_VIRTUAL_REGISTERS == 0
-SLJIT_ASSERT(sljit_get_register_index(SLJIT_INT_REGISTER, TMP3) >= 0 && sljit_get_register_index(SLJIT_INT_REGISTER, ARGUMENTS) >= 0 && sljit_get_register_index(SLJIT_INT_REGISTER, RETURN_ADDR) >= 0);
+SLJIT_ASSERT(sljit_get_register_index(SLJIT_GP_REGISTER, TMP3) >= 0 && sljit_get_register_index(SLJIT_GP_REGISTER, ARGUMENTS) >= 0 && sljit_get_register_index(SLJIT_GP_REGISTER, RETURN_ADDR) >= 0);
 #else
 #error "Invalid value for HAS_VIRTUAL_REGISTERS"
 #endif
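
The rename from SLJIT_INT_REGISTER to SLJIT_GP_REGISTER above is mechanical; the assertions keep relying on the same contract: sljit_get_register_index() returns a negative value for virtual registers, i.e. sljit registers not backed by a machine register on the target. A minimal sketch of the idiom (the helper name is hypothetical, not part of the patch):

	static int is_virtual_reg(sljit_s32 reg)
	{
		/* Negative index: no machine register backs this sljit register,
		   so its value must be preserved with an explicit SLJIT_MOV. */
		return sljit_get_register_index(SLJIT_GP_REGISTER, reg) < 0;
	}
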
diff --git a/src/pcre2_jit_simd_inc.h b/src/pcre2_jit_simd_inc.h
index 93353cf..355962d 100644
--- a/src/pcre2_jit_simd_inc.h
+++ b/src/pcre2_jit_simd_inc.h
@@ -201,7 +201,7 @@
   }
 }
 
-#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
+#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))
 
 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
 {
@@ -214,8 +214,8 @@
 struct sljit_jump *quit;
 struct sljit_jump *partial_quit[2];
 vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data_ind = 0;
 sljit_s32 tmp_ind = 1;
 sljit_s32 cmp1_ind = 2;
@@ -365,7 +365,7 @@
 #endif
 }
 
-#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
+#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))
 
 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
 {
@@ -375,8 +375,8 @@
 struct sljit_jump *quit;
 jump_list *not_found = NULL;
 vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data_ind = 0;
 sljit_s32 tmp_ind = 1;
 sljit_s32 cmp1_ind = 2;
@@ -497,7 +497,7 @@
 
 #ifndef _WIN64
 
-#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
+#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_FPU))
 
 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
@@ -509,9 +509,9 @@
 sljit_u32 bit1 = 0;
 sljit_u32 bit2 = 0;
 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP2);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data1_ind = 0;
 sljit_s32 data2_ind = 1;
 sljit_s32 tmp1_ind = 2;
@@ -1183,7 +1183,7 @@
   OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);
 
   /* VLVG */
-  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_INT_REGISTER, tmp_general_reg));
+  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
   instruction[1] = 0;
   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
   sljit_emit_op_custom(compiler, instruction, 6);
@@ -1262,8 +1262,8 @@
 struct sljit_jump *quit;
 struct sljit_jump *partial_quit[2];
 vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data_ind = 0;
 sljit_s32 tmp_ind = 1;
 sljit_s32 cmp1_ind = 2;
@@ -1460,8 +1460,8 @@
 struct sljit_jump *quit;
 jump_list *not_found = NULL;
 vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP3);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 tmp3_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP3);
 sljit_s32 data_ind = 0;
 sljit_s32 tmp_ind = 1;
 sljit_s32 cmp1_ind = 2;
@@ -1630,9 +1630,9 @@
 sljit_u32 bit1 = 0;
 sljit_u32 bit2 = 0;
 sljit_s32 diff = IN_UCHARS(offs2 - offs1);
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP2);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data1_ind = 0;
 sljit_s32 data2_ind = 1;
 sljit_s32 tmp1_ind = 2;
@@ -1949,8 +1949,8 @@
 struct sljit_jump *quit;
 struct sljit_jump *partial_quit[2];
 vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data_ind = 0;
 sljit_s32 tmp_ind = 1;
 sljit_s32 cmp1_ind = 2;
@@ -2082,8 +2082,8 @@
 struct sljit_jump *quit;
 jump_list *not_found = NULL;
 vector_compare_type compare_type = vector_compare_match1;
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data_ind = 0;
 sljit_s32 tmp_ind = 1;
 sljit_s32 cmp1_ind = 2;
@@ -2183,9 +2183,9 @@
 sljit_u32 bit1 = 0;
 sljit_u32 bit2 = 0;
 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
-sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP1);
-sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, TMP2);
-sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_INT_REGISTER, STR_PTR);
+sljit_s32 tmp1_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP1);
+sljit_s32 tmp2_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, TMP2);
+sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
 sljit_s32 data1_ind = 0;
 sljit_s32 data2_ind = 1;
 sljit_s32 tmp1_ind = 2;
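
All of the fast-path guards above move from the retired SLJIT_HAS_SSE2 probe to SLJIT_HAS_FPU, matching the sljitLir.h hunk below that replaces SLJIT_HAS_SSE2 with SLJIT_HAS_AVX/SLJIT_HAS_AVX2. The guards remain runtime checks, so call sites keep the same shape; a sketch under that assumption (the scalar fallback name is hypothetical):

	if (JIT_HAS_FAST_FORWARD_CHAR_SIMD)
		fast_forward_char_simd(common, char1, char2, offset);
	else
		fast_forward_char_scalar(common, char1, char2, offset); /* hypothetical fallback */
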
diff --git a/src/sljit/allocator_src/sljitExecAllocatorApple.c b/src/sljit/allocator_src/sljitExecAllocatorApple.c
index 87a0420..95b9842 100644
--- a/src/sljit/allocator_src/sljitExecAllocatorApple.c
+++ b/src/sljit/allocator_src/sljitExecAllocatorApple.c
@@ -33,15 +33,18 @@
    On non-macOS systems, returns MAP_JIT if it is defined.
 */
 #include <TargetConditionals.h>
-#if TARGET_OS_OSX
-#if defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86
+
+#if (defined(TARGET_OS_OSX) && TARGET_OS_OSX) || (TARGET_OS_MAC && !TARGET_OS_IPHONE)
+
+#if defined(SLJIT_CONFIG_X86) && SLJIT_CONFIG_X86
+
 #include <sys/utsname.h>
 #include <stdlib.h>
 
 #define SLJIT_MAP_JIT	(get_map_jit_flag())
 #define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
 
-static SLJIT_INLINE int get_map_jit_flag()
+static SLJIT_INLINE int get_map_jit_flag(void)
 {
 	size_t page_size;
 	void *ptr;
@@ -67,10 +70,8 @@
 	}
 	return map_jit_flag;
 }
-#else /* !SLJIT_CONFIG_X86 */
-#if !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
-#error "Unsupported architecture"
-#endif /* SLJIT_CONFIG_ARM */
+
+#elif defined(SLJIT_CONFIG_ARM) && SLJIT_CONFIG_ARM
 
 #include <AvailabilityMacros.h>
 #include <pthread.h>
@@ -86,9 +87,24 @@
 #endif /* BigSur */
 	pthread_jit_write_protect_np(enable_exec);
 }
-#endif /* SLJIT_CONFIG_X86 */
+
+#elif defined(SLJIT_CONFIG_PPC) && SLJIT_CONFIG_PPC
+
+#define SLJIT_MAP_JIT	(0)
+#define SLJIT_UPDATE_WX_FLAGS(from, to, enable_exec)
+
+#else
+#error "Unsupported architecture"
+#endif /* SLJIT_CONFIG */
+
 #else /* !TARGET_OS_OSX */
+
+#ifdef MAP_JIT
 #define SLJIT_MAP_JIT	(MAP_JIT)
+#else
+#define SLJIT_MAP_JIT	(0)
+#endif
+
 #endif /* TARGET_OS_OSX */
 
 static SLJIT_INLINE void* alloc_chunk(sljit_uw size)
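
With the new #else branch, SLJIT_MAP_JIT is defined on every path (MAP_JIT where available, otherwise 0), so alloc_chunk() can pass it unconditionally. Roughly what the call site reduces to (a simplified sketch of the allocator's mmap call, error handling omitted):

	void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
		MAP_PRIVATE | MAP_ANON | SLJIT_MAP_JIT, -1, 0);
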
diff --git a/src/sljit/sljitConfigInternal.h b/src/sljit/sljitConfigInternal.h
index 0f58aa8..3d0e3da 100644
--- a/src/sljit/sljitConfigInternal.h
+++ b/src/sljit/sljitConfigInternal.h
@@ -85,6 +85,8 @@
    Other macros:
      SLJIT_FUNC : calling convention attribute for both calling JIT from C and C calling back from JIT
      SLJIT_W(number) : defining 64 bit constants on 64 bit architectures (platform independent helper)
+     SLJIT_F64_SECOND(reg) : provides the register index of the second 32 bit part of a 64 bit
+                             floating point register when SLJIT_HAS_F64_AS_F32_PAIR returns non-zero
 */
 
 /***********************************************************/
@@ -201,6 +203,10 @@
 /* Instruction cache flush. */
 /****************************/
 
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
 /*
  * TODO:
  *
@@ -241,7 +247,7 @@
 /* Not required to implement on archs with unified caches. */
 #define SLJIT_CACHE_FLUSH(from, to)
 
-#elif defined __APPLE__
+#elif defined(__APPLE__) && MAC_OS_X_VERSION_MIN_REQUIRED >= 1050
 
 /* Supported by all macs since Mac OS 10.5.
    However, it does not work on non-jailbroken iOS devices,
@@ -393,9 +399,10 @@
 /* Auto detecting mips revision. */
 #if (defined __mips_isa_rev) && (__mips_isa_rev >= 6)
 #define SLJIT_MIPS_REV 6
-#elif (defined __mips_isa_rev && __mips_isa_rev >= 1) \
-	|| (defined __clang__ && defined _MIPS_ARCH_OCTEON) \
-	|| (defined __clang__ && defined _MIPS_ARCH_P5600)
+#elif defined(__mips_isa_rev) && __mips_isa_rev >= 1
+#define SLJIT_MIPS_REV __mips_isa_rev
+#elif defined(__clang__) \
+	&& (defined(_MIPS_ARCH_OCTEON) || defined(_MIPS_ARCH_P5600))
 /* clang either forgets to define (clang-7) __mips_isa_rev at all
  * or sets it to zero (clang-8,-9) for -march=octeon (MIPS64 R2+)
  * and -march=p5600 (MIPS32 R5).
@@ -676,6 +683,19 @@
 #define SLJIT_HAS_STATUS_FLAGS_STATE 1
 #endif
 
+/***************************************/
+/* Floating point register management. */
+/***************************************/
+
+#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
+	|| (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+#define SLJIT_F64_SECOND(reg) \
+	((reg) + SLJIT_FS0)
+#else /* !SLJIT_CONFIG_ARM_32 && !SLJIT_CONFIG_MIPS_32 */
+#define SLJIT_F64_SECOND(reg) \
+	(reg)
+#endif /* SLJIT_CONFIG_ARM_32 || SLJIT_CONFIG_MIPS_32 */
+
 /*************************************/
 /* Debug and verbose related macros. */
 /*************************************/
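
SLJIT_F64_SECOND() exists for targets where a 64 bit float register is addressable as two 32 bit halves (ARM32 VFP, MIPS32): there the second half is encoded as reg + SLJIT_FS0, elsewhere the macro is the identity. A short sketch of the intended use:

	/* Sketch: name both 32 bit halves of an f64 register pair. */
	if (sljit_has_cpu_feature(SLJIT_HAS_F64_AS_F32_PAIR)) {
		sljit_s32 lo = SLJIT_FR0;                   /* first 32 bit part */
		sljit_s32 hi = SLJIT_F64_SECOND(SLJIT_FR0); /* second 32 bit part */
	}
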
diff --git a/src/sljit/sljitLir.c b/src/sljit/sljitLir.c
index 9278c06..564eb73 100644
--- a/src/sljit/sljitLir.c
+++ b/src/sljit/sljitLir.c
@@ -123,29 +123,29 @@
 #endif
 
 /* Parameter parsing. */
-#define REG_MASK		0x3f
+#define REG_MASK		0x7f
 #define OFFS_REG(reg)		(((reg) >> 8) & REG_MASK)
 #define OFFS_REG_MASK		(REG_MASK << 8)
 #define TO_OFFS_REG(reg)	((reg) << 8)
-/* When reg cannot be unused. */
-#define FAST_IS_REG(reg)	((reg) <= REG_MASK)
+#define FAST_IS_REG(reg)	((reg) < REG_MASK)
 
 /* Mask for argument types. */
 #define SLJIT_ARG_MASK		0x7
 #define SLJIT_ARG_FULL_MASK	(SLJIT_ARG_MASK | SLJIT_ARG_TYPE_SCRATCH_REG)
 
-/* Mask for sljit_emit_mem. */
-#define REG_PAIR_MASK		0xff00
-#define REG_PAIR_FIRST(reg)	((reg) & 0xff)
+/* Mask for register pairs. */
+#define REG_PAIR_MASK		0x7f00
+#define REG_PAIR_FIRST(reg)	((reg) & 0x7f)
 #define REG_PAIR_SECOND(reg)	((reg) >> 8)
 
 /* Mask for sljit_emit_enter. */
 #define SLJIT_KEPT_SAVEDS_COUNT(options) ((options) & 0x3)
 
 /* Getters for simd operations, which returns with log2(size). */
-#define SLJIT_SIMD_GET_REG_SIZE(type) (((type) >> 12) & 0x3f)
-#define SLJIT_SIMD_GET_ELEM_SIZE(type) (((type) >> 18) & 0x3f)
-#define SLJIT_SIMD_GET_ALIGNMENT(type) (((type) >> 24) & 0x3f)
+#define SLJIT_SIMD_GET_REG_SIZE(type)		(((type) >> 12) & 0x3f)
+#define SLJIT_SIMD_GET_ELEM_SIZE(type)		(((type) >> 18) & 0x3f)
+#define SLJIT_SIMD_GET_ALIGNMENT(type)		(((type) >> 24) & 0x3f)
+#define SLJIT_SIMD_GET_ELEM2_SIZE(type)		(((type) >> 24) & 0x3f)
 
 /* Jump flags. */
 #define JUMP_LABEL	0x1
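
Widening REG_MASK from 0x3f to 0x7f makes room for the doubled float register numbering introduced for ARM32 (see sljitNativeARM_32.c below), and SLJIT_IMM moves to 0x7f in sljitLir.h so it sits just past the register range; FAST_IS_REG therefore becomes a strict comparison. The operand layout itself is unchanged; illustratively:

	/* bits 0..6  : base register (REG_MASK == 0x7f, SLJIT_IMM == 0x7f)
	   bits 8..14 : offset register of SLJIT_MEM2 / second register of a pair */
	sljit_s32 mem2 = SLJIT_MEM | SLJIT_R0 | (SLJIT_R1 << 8); /* == SLJIT_MEM2(SLJIT_R0, SLJIT_R1) */
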
@@ -846,10 +846,6 @@
 	(((r) >= SLJIT_R0 && (r) < (SLJIT_R0 + compiler->scratches)) \
 	|| ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0))
 
-#define FUNCTION_CHECK_IS_FREG(fr) \
-	(((fr) >= SLJIT_FR0 && (fr) < (SLJIT_FR0 + compiler->fscratches)) \
-	|| ((fr) > (SLJIT_FS0 - compiler->fsaveds) && (fr) <= SLJIT_FS0))
-
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 #define CHECK_IF_VIRTUAL_REGISTER(p) ((p) <= SLJIT_S3 && (p) >= SLJIT_S8)
 #else
@@ -858,7 +854,7 @@
 
 static sljit_s32 function_check_src_mem(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
 {
-	if (compiler->scratches == -1 || compiler->saveds == -1)
+	if (compiler->scratches == -1)
 		return 0;
 
 	if (!(p & SLJIT_MEM))
@@ -895,7 +891,7 @@
 
 static sljit_s32 function_check_src(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
 {
-	if (compiler->scratches == -1 || compiler->saveds == -1)
+	if (compiler->scratches == -1)
 		return 0;
 
 	if (FUNCTION_CHECK_IS_REG(p))
@@ -912,7 +908,7 @@
 
 static sljit_s32 function_check_dst(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
 {
-	if (compiler->scratches == -1 || compiler->saveds == -1)
+	if (compiler->scratches == -1)
 		return 0;
 
 	if (FUNCTION_CHECK_IS_REG(p))
@@ -924,19 +920,57 @@
 #define FUNCTION_CHECK_DST(p, i) \
 	CHECK_ARGUMENT(function_check_dst(compiler, p, i));
 
-static sljit_s32 function_fcheck(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
+#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
+	|| (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+
+#define FUNCTION_CHECK_IS_FREG(fr, is_32) \
+	function_check_is_freg(compiler, (fr), (is_32))
+
+static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s32 fr, sljit_s32 is_32);
+
+#define FUNCTION_FCHECK(p, i, is_32) \
+	CHECK_ARGUMENT(function_fcheck(compiler, (p), (i), (is_32)));
+
+static sljit_s32 function_fcheck(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i, sljit_s32 is_32)
 {
-	if (compiler->scratches == -1 || compiler->saveds == -1)
+	if (compiler->scratches == -1)
 		return 0;
 
-	if (FUNCTION_CHECK_IS_FREG(p))
+	if (FUNCTION_CHECK_IS_FREG(p, is_32))
 		return (i == 0);
 
 	return function_check_src_mem(compiler, p, i);
 }
 
-#define FUNCTION_FCHECK(p, i) \
-	CHECK_ARGUMENT(function_fcheck(compiler, p, i));
+#else /* !SLJIT_CONFIG_ARM_32 && !SLJIT_CONFIG_MIPS_32 */
+#define FUNCTION_CHECK_IS_FREG(fr, is_32) \
+	function_check_is_freg(compiler, (fr))
+
+static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s32 fr)
+{
+	if (compiler->scratches == -1)
+		return 0;
+
+	return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
+		|| (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+}
+
+#define FUNCTION_FCHECK(p, i, is_32) \
+	CHECK_ARGUMENT(function_fcheck(compiler, (p), (i)));
+
+static sljit_s32 function_fcheck(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
+{
+	if (compiler->scratches == -1)
+		return 0;
+
+	if ((p >= SLJIT_FR0 && p < (SLJIT_FR0 + compiler->fscratches))
+			|| (p > (SLJIT_FS0 - compiler->fsaveds) && p <= SLJIT_FS0))
+		return (i == 0);
+
+	return function_check_src_mem(compiler, p, i);
+}
+
+#endif /* SLJIT_CONFIG_ARM_32 || SLJIT_CONFIG_MIPS_32 */
 
 #endif /* SLJIT_ARGUMENT_CHECKS */
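
The extra is_32 parameter matters only on targets where an f64 register is a pair of f32 registers: there a 32 bit float may legitimately name the upper half, encoded as reg + SLJIT_FS0, and the per-target function_check_is_freg() (see the sljitNativeARM_32.c hunk below) normalizes that alias before the range check. A sketch of the encoding being accepted:

	/* Sketch: the second f32 half of FR0 on ARM32/MIPS32; this form
	   passes the checks above only when is_32 is non-zero. */
	sljit_s32 fr0_second = SLJIT_F64_SECOND(SLJIT_FR0); /* == SLJIT_FR0 + SLJIT_FS0 */
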
 
@@ -973,6 +1007,14 @@
 
 static void sljit_verbose_freg(struct sljit_compiler *compiler, sljit_s32 r)
 {
+#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
+		|| (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	if (r >= (SLJIT_FS0 + SLJIT_FR0) && r <= (SLJIT_FS0 + SLJIT_FS0)) {
+		fprintf(compiler->verbose, "^");
+		r -= SLJIT_FS0;
+	}
+#endif /* SLJIT_CONFIG_ARM_32 || SLJIT_CONFIG_MIPS_32 */
+
 	if (r < (SLJIT_FR0 + compiler->fscratches))
 		fprintf(compiler->verbose, "fr%d", r - SLJIT_FR0);
 	else
@@ -981,7 +1023,7 @@
 
 static void sljit_verbose_param(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
 {
-	if ((p) & SLJIT_IMM)
+	if ((p) == SLJIT_IMM)
 		fprintf(compiler->verbose, "#%" SLJIT_PRINT_D "d", (i));
 	else if ((p) & SLJIT_MEM) {
 		if ((p) & REG_MASK) {
@@ -1259,7 +1301,7 @@
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT(compiler->last_return == SLJIT_ARG_TYPE_VOID);
+	CHECK_ARGUMENT(compiler->last_return == SLJIT_ARG_TYPE_RET_VOID);
 #endif
 
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1302,7 +1344,7 @@
 	if (GET_OPCODE(op) < SLJIT_MOV_F64) {
 		FUNCTION_CHECK_SRC(src, srcw);
 	} else {
-		FUNCTION_FCHECK(src, srcw);
+		FUNCTION_FCHECK(src, srcw, op & SLJIT_32);
 	}
 	compiler->last_flags = 0;
 #endif
@@ -1417,6 +1459,7 @@
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_ATOMIC));
 	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_MOV_P);
 	CHECK_ARGUMENT(GET_OPCODE(op) != SLJIT_MOV_S8 && GET_OPCODE(op) != SLJIT_MOV_S16 && GET_OPCODE(op) != SLJIT_MOV_S32);
 
@@ -1458,6 +1501,7 @@
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_ATOMIC));
 	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV && GET_OPCODE(op) <= SLJIT_MOV_P);
 	CHECK_ARGUMENT(GET_OPCODE(op) != SLJIT_MOV_S8 && GET_OPCODE(op) != SLJIT_MOV_S16 && GET_OPCODE(op) != SLJIT_MOV_S32);
 
@@ -1469,7 +1513,7 @@
 	CHECK_ARGUMENT(!(op & VARIABLE_FLAG_MASK) || GET_FLAG_TYPE(op) == SLJIT_ATOMIC_STORED);
 
 	if (GET_OPCODE(op) == SLJIT_MOV_U8 || GET_OPCODE(op) == SLJIT_MOV_U16) {
-		/* Only SLJIT_32, SLJIT_ATOMIC_STORED is allowed. */
+		/* Only SLJIT_32, SLJIT_ATOMIC_STORED are allowed. */
 		CHECK_ARGUMENT(!(op & SLJIT_SET_Z));
 	} else {
 		/* Only SLJIT_ATOMIC_STORED is allowed. */
@@ -1660,7 +1704,7 @@
 	SLJIT_UNUSED_ARG(type);
 	SLJIT_UNUSED_ARG(reg);
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	if (type == SLJIT_INT_REGISTER) {
+	if (type == SLJIT_GP_REGISTER) {
 		CHECK_ARGUMENT(reg > 0 && reg <= SLJIT_NUMBER_OF_REGISTERS);
 	} else {
 		CHECK_ARGUMENT(type == SLJIT_FLOAT_REGISTER || ((type >> 12) == 0 || ((type >> 12) >= 3 && (type >> 12) <= 6)));
@@ -1719,8 +1763,8 @@
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_MOV_F64 && GET_OPCODE(op) <= SLJIT_ABS_F64);
 	CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
-	FUNCTION_FCHECK(src, srcw);
-	FUNCTION_FCHECK(dst, dstw);
+	FUNCTION_FCHECK(src, srcw, op & SLJIT_32);
+	FUNCTION_FCHECK(dst, dstw, op & SLJIT_32);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1759,8 +1803,8 @@
 	CHECK_ARGUMENT(!(op & SLJIT_SET_Z));
 	CHECK_ARGUMENT((op & VARIABLE_FLAG_MASK)
 		|| (GET_FLAG_TYPE(op) >= SLJIT_F_EQUAL && GET_FLAG_TYPE(op) <= SLJIT_ORDERED_LESS_EQUAL));
-	FUNCTION_FCHECK(src1, src1w);
-	FUNCTION_FCHECK(src2, src2w);
+	FUNCTION_FCHECK(src1, src1w, op & SLJIT_32);
+	FUNCTION_FCHECK(src2, src2w, op & SLJIT_32);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1790,7 +1834,7 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
-	FUNCTION_FCHECK(src, srcw);
+	FUNCTION_FCHECK(src, srcw, op & SLJIT_32);
 	FUNCTION_CHECK_DST(dst, dstw);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -1820,7 +1864,7 @@
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
 	FUNCTION_CHECK_SRC(src, srcw);
-	FUNCTION_FCHECK(dst, dstw);
+	FUNCTION_FCHECK(dst, dstw, op & SLJIT_32);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1850,9 +1894,9 @@
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_ADD_F64 && GET_OPCODE(op) <= SLJIT_DIV_F64);
 	CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
-	FUNCTION_FCHECK(src1, src1w);
-	FUNCTION_FCHECK(src2, src2w);
-	FUNCTION_FCHECK(dst, dstw);
+	FUNCTION_FCHECK(src1, src1w, op & SLJIT_32);
+	FUNCTION_FCHECK(src2, src2w, op & SLJIT_32);
+	FUNCTION_FCHECK(dst, dstw, op & SLJIT_32);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1876,9 +1920,9 @@
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(GET_OPCODE(op) == SLJIT_COPYSIGN_F64);
-	FUNCTION_FCHECK(src1, src1w);
-	FUNCTION_FCHECK(src2, src2w);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg));
+	FUNCTION_FCHECK(src1, src1w, op & SLJIT_32);
+	FUNCTION_FCHECK(src2, src2w, op & SLJIT_32);
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, op & SLJIT_32));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1906,7 +1950,7 @@
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 1));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1930,7 +1974,7 @@
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -1949,7 +1993,7 @@
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(GET_OPCODE(op) >= SLJIT_COPY_TO_F64 && GET_OPCODE(op) <= SLJIT_COPY_FROM_F64);
 	CHECK_ARGUMENT(!(op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, op & SLJIT_32));
 
 #if (defined SLJIT_64BIT_ARCHITECTURE && SLJIT_64BIT_ARCHITECTURE)
 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(reg));
@@ -2134,8 +2178,8 @@
 	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_REWRITABLE_JUMP | SLJIT_32)));
 	CHECK_ARGUMENT((type & 0xff) >= SLJIT_F_EQUAL && (type & 0xff) <= SLJIT_ORDERED_LESS_EQUAL);
-	FUNCTION_FCHECK(src1, src1w);
-	FUNCTION_FCHECK(src2, src2w);
+	FUNCTION_FCHECK(src1, src1w, type & SLJIT_32);
+	FUNCTION_FCHECK(src2, src2w, type & SLJIT_32);
 	compiler->last_flags = 0;
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -2302,9 +2346,9 @@
 	CHECK_ARGUMENT(cond >= SLJIT_EQUAL && cond <= SLJIT_ORDERED_LESS_EQUAL);
 
 	CHECK_ARGUMENT(compiler->fscratches != -1 && compiler->fsaveds != -1);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg));
-	FUNCTION_FCHECK(src1, src1w);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src2_freg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, type & SLJIT_32));
+	FUNCTION_FCHECK(src1, src1w, type & SLJIT_32);
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src2_freg, type & SLJIT_32));
 
 	if (cond <= SLJIT_NOT_ZERO)
 		CHECK_ARGUMENT(compiler->last_flags & SLJIT_SET_Z);
@@ -2335,14 +2379,16 @@
 	sljit_s32 reg,
 	sljit_s32 mem, sljit_sw memw)
 {
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	sljit_s32 allowed_flags;
+#endif /* SLJIT_ARGUMENT_CHECKS */
+
 	if (SLJIT_UNLIKELY(compiler->skip_checks)) {
 		compiler->skip_checks = 0;
 		CHECK_RETURN_OK;
 	}
 
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	sljit_s32 allowed_flags;
-
 	if (type & SLJIT_MEM_UNALIGNED) {
 		CHECK_ARGUMENT(!(type & (SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32)));
 	} else if (type & SLJIT_MEM_ALIGNED_16) {
@@ -2354,14 +2400,14 @@
 	allowed_flags = SLJIT_MEM_UNALIGNED;
 
 	switch (type & 0xff) {
+	case SLJIT_MOV_P:
+	case SLJIT_MOV:
+		allowed_flags |= SLJIT_MEM_ALIGNED_32;
+		/* fallthrough */
 	case SLJIT_MOV_U32:
 	case SLJIT_MOV_S32:
 	case SLJIT_MOV32:
-		allowed_flags = SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16;
-		break;
-	case SLJIT_MOV:
-	case SLJIT_MOV_P:
-		allowed_flags = SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32;
+		allowed_flags |= SLJIT_MEM_ALIGNED_16;
 		break;
 	}
 
@@ -2465,6 +2511,7 @@
 	sljit_s32 mem, sljit_sw memw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV_F64);
 
 	if (type & SLJIT_MEM_UNALIGNED) {
@@ -2477,7 +2524,7 @@
 	}
 
 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_UNALIGNED | SLJIT_MEM_ALIGNED_16 | SLJIT_MEM_ALIGNED_32)));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, type & SLJIT_32));
 	FUNCTION_CHECK_SRC_MEM(mem, memw);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -2508,10 +2555,11 @@
 	sljit_s32 mem, sljit_sw memw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_FPU));
 	CHECK_ARGUMENT((type & 0xff) == SLJIT_MOV_F64);
 	CHECK_ARGUMENT((type & ~(0xff | SLJIT_32 | SLJIT_MEM_STORE | SLJIT_MEM_SUPP | SLJIT_MEM_POST)) == 0);
 	FUNCTION_CHECK_SRC_MEM(mem, memw);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, type & SLJIT_32));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -2541,12 +2589,13 @@
 	sljit_s32 srcdst, sljit_sw srcdstw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
 	CHECK_ARGUMENT((type & (sljit_s32)(0xc0000fff - (SLJIT_SIMD_STORE | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
 	CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
 	CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type));
 	CHECK_ARGUMENT(SLJIT_SIMD_GET_ALIGNMENT(type) <= (srcdst & SLJIT_MEM) ? SLJIT_SIMD_GET_REG_SIZE(type) : 0);
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
-	FUNCTION_FCHECK(srcdst, srcdstw);
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
+	FUNCTION_FCHECK(srcdst, srcdstw, 0);
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
@@ -2582,16 +2631,17 @@
 	sljit_s32 src, sljit_sw srcw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
 	CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
 	CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
 	CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
 
 	if (type & SLJIT_SIMD_FLOAT) {
 		if (src == SLJIT_IMM) {
 			CHECK_ARGUMENT(srcw == 0);
 		} else {
-			FUNCTION_FCHECK(src, srcw);
+			FUNCTION_FCHECK(src, srcw, SLJIT_SIMD_GET_ELEM_SIZE(type) == 2);
 		}
 	} else if (src != SLJIT_IMM) {
 		FUNCTION_CHECK_DST(src, srcw);
@@ -2628,15 +2678,20 @@
 	sljit_s32 srcdst, sljit_sw srcdstw)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
-	CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_STORE | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
+	CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO | SLJIT_SIMD_LANE_SIGNED | SLJIT_32 | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
+	CHECK_ARGUMENT((type & (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO)) != (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO));
+	CHECK_ARGUMENT((type & (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_SIGNED)) != SLJIT_SIMD_LANE_SIGNED);
+	CHECK_ARGUMENT(!(type & SLJIT_SIMD_FLOAT) || !(type & (SLJIT_SIMD_LANE_SIGNED | SLJIT_32)));
 	CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
 	CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
+	CHECK_ARGUMENT(!(type & SLJIT_32) || SLJIT_SIMD_GET_ELEM_SIZE(type) <= 2);
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
 	CHECK_ARGUMENT(lane_index >= 0 && lane_index < (1 << (SLJIT_SIMD_GET_REG_SIZE(type) - SLJIT_SIMD_GET_ELEM_SIZE(type))));
 
 	if (type & SLJIT_SIMD_FLOAT) {
-		FUNCTION_FCHECK(srcdst, srcdstw);
-	} else if ((type & SLJIT_SIMD_STORE) || (srcdst != SLJIT_IMM)) {
+		FUNCTION_FCHECK(srcdst, srcdstw, SLJIT_SIMD_GET_ELEM_SIZE(type) == 2);
+	} else if ((type & SLJIT_SIMD_STORE) || srcdst != SLJIT_IMM) {
 		FUNCTION_CHECK_DST(srcdst, srcdstw);
 	}
 #endif
@@ -2649,8 +2704,11 @@
 			CHECK_RETURN_OK;
 		}
 
-		fprintf(compiler->verbose, "  simd_%s_lane.%d.%s%d ",
+		fprintf(compiler->verbose, "  simd_%s_lane%s%s%s.%d.%s%d ",
 			(type & SLJIT_SIMD_STORE) ? "store" : "load",
+			(type & SLJIT_32) ? "32" : "",
+			(type & SLJIT_SIMD_LANE_ZERO) ? "_z" : "",
+			(type & SLJIT_SIMD_LANE_SIGNED) ? "_s" : "",
 			(8 << SLJIT_SIMD_GET_REG_SIZE(type)),
 			(type & SLJIT_SIMD_FLOAT) ? "f" : "",
 			(8 << SLJIT_SIMD_GET_ELEM_SIZE(type)));
@@ -2672,11 +2730,12 @@
 	sljit_s32 src, sljit_s32 src_lane_index)
 {
 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
 	CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
 	CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
 	CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg));
-	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src, 0));
 	CHECK_ARGUMENT(src_lane_index >= 0 && src_lane_index < (1 << (SLJIT_SIMD_GET_REG_SIZE(type) - SLJIT_SIMD_GET_ELEM_SIZE(type))));
 #endif
 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
@@ -2702,6 +2761,82 @@
 	CHECK_RETURN_OK;
 }
 
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
+	CHECK_ARGUMENT((type & (sljit_s32)(0xc0000fff - (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
+	CHECK_ARGUMENT((type & (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT)) != (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT));
+	CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+	CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
+	CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_ELEM2_SIZE(type));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
+	FUNCTION_FCHECK(src, srcw, SLJIT_SIMD_GET_ELEM_SIZE(type) == 2);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		if (type & SLJIT_SIMD_TEST)
+			CHECK_RETURN_OK;
+		if (sljit_emit_simd_extend(compiler, type | SLJIT_SIMD_TEST, freg, src, srcw) == SLJIT_ERR_UNSUPPORTED) {
+			fprintf(compiler->verbose, "    # simd_extend: unsupported form, no instructions are emitted\n");
+			CHECK_RETURN_OK;
+		}
+
+		fprintf(compiler->verbose, "  simd_load_extend%s.%d.%s%d.%s%d ",
+			(type & SLJIT_SIMD_EXTEND_SIGNED) ? "_s" : "",
+			(8 << SLJIT_SIMD_GET_REG_SIZE(type)),
+			(type & SLJIT_SIMD_FLOAT) ? "f" : "",
+			(8 << SLJIT_SIMD_GET_ELEM2_SIZE(type)),
+			(type & SLJIT_SIMD_FLOAT) ? "f" : "",
+			(8 << SLJIT_SIMD_GET_ELEM_SIZE(type)));
+
+		sljit_verbose_freg(compiler, freg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_fparam(compiler, src, srcw);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw)
+{
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+	CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
+	CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST | SLJIT_32))) == SLJIT_SIMD_STORE);
+	CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+	CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
+	CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
+	FUNCTION_CHECK_DST(dst, dstw);
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+		if (type & SLJIT_SIMD_TEST)
+			CHECK_RETURN_OK;
+		if (sljit_emit_simd_sign(compiler, type | SLJIT_SIMD_TEST, freg, dst, dstw) == SLJIT_ERR_UNSUPPORTED) {
+			fprintf(compiler->verbose, "    # simd_sign: unsupported form, no instructions are emitted\n");
+			CHECK_RETURN_OK;
+		}
+
+		fprintf(compiler->verbose, "  simd_store_sign%s.%d.%s%d ",
+			(type & SLJIT_32) ? "32" : "",
+			(8 << SLJIT_SIMD_GET_REG_SIZE(type)),
+			(type & SLJIT_SIMD_FLOAT) ? "f" : "",
+			(8 << SLJIT_SIMD_GET_ELEM_SIZE(type)));
+
+		sljit_verbose_freg(compiler, freg);
+		fprintf(compiler->verbose, ", ");
+		sljit_verbose_param(compiler, dst, dstw);
+		fprintf(compiler->verbose, "\n");
+	}
+#endif
+	CHECK_RETURN_OK;
+}
+
 static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
 {
 	/* Any offset is allowed. */
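
The two new checkers mirror the public entry points added later in this file. For reference, a plausible call shape for sljit_emit_simd_extend under the constraints checked above (elem size < elem2 size < reg size); the constants are those defined in the sljitLir.h hunk below, with SLJIT_SIMD_ELEM_8 assumed to name the 8 bit element option:

	/* Sketch: load 8 bytes from [R0] and zero-extend them to 8 x 16 bit lanes. */
	sljit_emit_simd_extend(compiler,
		SLJIT_SIMD_LOAD | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8 | SLJIT_SIMD_EXTEND_16,
		SLJIT_FR0, SLJIT_MEM1(SLJIT_R0), 0);
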
@@ -2913,7 +3048,8 @@
 }
 
 #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
-	&& !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
+	&& !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
+	&& !(defined(SLJIT_CONFIG_LOONGARCH_64) && SLJIT_CONFIG_LOONGARCH_64)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst_freg,
@@ -2929,7 +3065,7 @@
 	return sljit_emit_fop2(compiler, op, dst_freg, 0, src1, src1w, src2, src2w);
 }
 
-#endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_S390X */
+#endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_S390X && !SLJIT_CONFIG_LOONGARCH_64 */
 
 #if !(defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) \
 	&& !(defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) \
@@ -2949,18 +3085,18 @@
 	condition = type & 0xff;
 #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
 	if ((condition == SLJIT_EQUAL || condition == SLJIT_NOT_EQUAL)) {
-		if ((src1 & SLJIT_IMM) && !src1w) {
+		if (src1 == SLJIT_IMM && !src1w) {
 			src1 = src2;
 			src1w = src2w;
 			src2 = SLJIT_IMM;
 			src2w = 0;
 		}
-		if ((src2 & SLJIT_IMM) && !src2w)
+		if (src2 == SLJIT_IMM && !src2w)
 			return emit_cmp_to0(compiler, type, src1, src1w);
 	}
 #endif
 
-	if (SLJIT_UNLIKELY((src1 & SLJIT_IMM) && !(src2 & SLJIT_IMM))) {
+	if (SLJIT_UNLIKELY(src1 == SLJIT_IMM && src2 != SLJIT_IMM)) {
 		/* Immediate is preferred as second argument by most architectures. */
 		switch (condition) {
 		case SLJIT_LESS:
@@ -3104,6 +3240,11 @@
 {
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw));
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(srcdst);
+	SLJIT_UNUSED_ARG(srcdstw);
 
 	return SLJIT_ERR_UNSUPPORTED;
 }
@@ -3114,6 +3255,11 @@
 {
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw));
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(src);
+	SLJIT_UNUSED_ARG(srcw);
 
 	return SLJIT_ERR_UNSUPPORTED;
 }
@@ -3124,6 +3270,12 @@
 {
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw));
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(lane_index);
+	SLJIT_UNUSED_ARG(srcdst);
+	SLJIT_UNUSED_ARG(srcdstw);
 
 	return SLJIT_ERR_UNSUPPORTED;
 }
@@ -3134,12 +3286,88 @@
 {
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index));
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(src);
+	SLJIT_UNUSED_ARG(src_lane_index);
+
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(src);
+	SLJIT_UNUSED_ARG(srcw);
+
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(type);
+	SLJIT_UNUSED_ARG(freg);
+	SLJIT_UNUSED_ARG(dst);
+	SLJIT_UNUSED_ARG(dstw);
 
 	return SLJIT_ERR_UNSUPPORTED;
 }
 
 #endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_ARM */
 
+#if !(defined(SLJIT_CONFIG_X86) && SLJIT_CONFIG_X86) \
+	&& !(defined(SLJIT_CONFIG_ARM) && SLJIT_CONFIG_ARM) \
+	&& !(defined(SLJIT_CONFIG_S390X) && SLJIT_CONFIG_S390X) \
+	&& !(defined(SLJIT_CONFIG_LOONGARCH) && SLJIT_CONFIG_LOONGARCH)
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler,
+	sljit_s32 op,
+	sljit_s32 dst_reg,
+	sljit_s32 mem_reg)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(op);
+	SLJIT_UNUSED_ARG(dst_reg);
+	SLJIT_UNUSED_ARG(mem_reg);
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));
+
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler,
+	sljit_s32 op,
+	sljit_s32 src_reg,
+	sljit_s32 mem_reg,
+	sljit_s32 temp_reg)
+{
+	SLJIT_UNUSED_ARG(compiler);
+	SLJIT_UNUSED_ARG(op);
+	SLJIT_UNUSED_ARG(src_reg);
+	SLJIT_UNUSED_ARG(mem_reg);
+	SLJIT_UNUSED_ARG(temp_reg);
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
+
+	return SLJIT_ERR_UNSUPPORTED;
+}
+
+#endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_ARM && !SLJIT_CONFIG_S390X && !SLJIT_CONFIG_LOONGARCH */
+
 #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
 	&& !(defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
 
diff --git a/src/sljit/sljitLir.h b/src/sljit/sljitLir.h
index ae68737..95d59e4 100644
--- a/src/sljit/sljitLir.h
+++ b/src/sljit/sljitLir.h
@@ -322,20 +322,24 @@
      arg_d must be placed in SLJIT_FR1
 
    Examples for argument processing by sljit_emit_enter:
-     SLJIT_ARGS4(VOID, P, 32_R, F32, W)
+     SLJIT_ARGS4V(P, 32_R, F32, W)
      Arguments are placed into: SLJIT_S0, SLJIT_R1, SLJIT_FR0, SLJIT_S1
+     The type of the result is void.
 
-     SLJIT_ARGS4(VOID, W, W_R, W, W_R)
+     SLJIT_ARGS4(F32, W, W_R, W, W_R)
      Arguments are placed into: SLJIT_S0, SLJIT_R1, SLJIT_S1, SLJIT_R3
+     The type of the result is sljit_f32.
 
-     SLJIT_ARGS4(VOID, F64, W, F32, P_R)
+     SLJIT_ARGS4(P, F64, W, F32, P_R)
      Arguments are placed into: SLJIT_FR0, SLJIT_S0, SLJIT_FR1, SLJIT_R1
+     The type of the result is pointer.
 
      Note: it is recommended to pass the scratch arguments first
      followed by the saved arguments:
 
-       SLJIT_ARGS4(VOID, W_R, W_R, W, W)
+       SLJIT_ARGS4(W, W_R, W_R, W, W)
        Arguments are placed into: SLJIT_R0, SLJIT_R1, SLJIT_S0, SLJIT_S1
+       The type of the result is sljit_sw / sljit_uw.
 */
 
 /* The following flag is only allowed for the integer arguments of
@@ -343,21 +347,21 @@
    stored in a scratch register instead of a saved register. */
 #define SLJIT_ARG_TYPE_SCRATCH_REG 0x8
 
-/* Void result, can only be used by SLJIT_ARG_RETURN. */
-#define SLJIT_ARG_TYPE_VOID	0
+/* No return value, only supported by SLJIT_ARG_RETURN. */
+#define SLJIT_ARG_TYPE_RET_VOID		0
 /* Machine word sized integer argument or result. */
-#define SLJIT_ARG_TYPE_W	1
+#define SLJIT_ARG_TYPE_W		1
 #define SLJIT_ARG_TYPE_W_R	(SLJIT_ARG_TYPE_W | SLJIT_ARG_TYPE_SCRATCH_REG)
 /* 32 bit integer argument or result. */
-#define SLJIT_ARG_TYPE_32	2
+#define SLJIT_ARG_TYPE_32		2
 #define SLJIT_ARG_TYPE_32_R	(SLJIT_ARG_TYPE_32 | SLJIT_ARG_TYPE_SCRATCH_REG)
 /* Pointer sized integer argument or result. */
-#define SLJIT_ARG_TYPE_P	3
+#define SLJIT_ARG_TYPE_P		3
 #define SLJIT_ARG_TYPE_P_R	(SLJIT_ARG_TYPE_P | SLJIT_ARG_TYPE_SCRATCH_REG)
 /* 64 bit floating point argument or result. */
-#define SLJIT_ARG_TYPE_F64	4
+#define SLJIT_ARG_TYPE_F64		4
 /* 32 bit floating point argument or result. */
-#define SLJIT_ARG_TYPE_F32	5
+#define SLJIT_ARG_TYPE_F32		5
 
 #define SLJIT_ARG_SHIFT 4
 #define SLJIT_ARG_RETURN(type) (type)
@@ -370,24 +374,40 @@
 
    can be shortened to:
        SLJIT_ARGS1(W, F32)
+
+   Another example where no value is returned:
+       SLJIT_ARG_RETURN(SLJIT_ARG_TYPE_RET_VOID) | SLJIT_ARG_VALUE(SLJIT_ARG_TYPE_W_R, 1)
+
+   can be shortened to:
+       SLJIT_ARGS1V(W_R)
 */
 
 #define SLJIT_ARG_TO_TYPE(type) SLJIT_ARG_TYPE_ ## type
 
 #define SLJIT_ARGS0(ret) \
 	SLJIT_ARG_RETURN(SLJIT_ARG_TO_TYPE(ret))
+#define SLJIT_ARGS0V() \
+	SLJIT_ARG_RETURN(SLJIT_ARG_TYPE_RET_VOID)
 
 #define SLJIT_ARGS1(ret, arg1) \
 	(SLJIT_ARGS0(ret) | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg1), 1))
+#define SLJIT_ARGS1V(arg1) \
+	(SLJIT_ARGS0V() | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg1), 1))
 
 #define SLJIT_ARGS2(ret, arg1, arg2) \
 	(SLJIT_ARGS1(ret, arg1) | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg2), 2))
+#define SLJIT_ARGS2V(arg1, arg2) \
+	(SLJIT_ARGS1V(arg1) | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg2), 2))
 
 #define SLJIT_ARGS3(ret, arg1, arg2, arg3) \
 	(SLJIT_ARGS2(ret, arg1, arg2) | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg3), 3))
+#define SLJIT_ARGS3V(arg1, arg2, arg3) \
+	(SLJIT_ARGS2V(arg1, arg2) | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg3), 3))
 
 #define SLJIT_ARGS4(ret, arg1, arg2, arg3, arg4) \
 	(SLJIT_ARGS3(ret, arg1, arg2, arg3) | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg4), 4))
+#define SLJIT_ARGS4V(arg1, arg2, arg3, arg4) \
+	(SLJIT_ARGS3V(arg1, arg2, arg3) | SLJIT_ARG_VALUE(SLJIT_ARG_TO_TYPE(arg4), 4))
 
 /* --------------------------------------------------------------------- */
 /*  Main structures and functions                                        */
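
The V variants spare callers a dedicated RET_VOID spelling at every use site. For instance, declaring a void callback that takes a pointer and a word (a sketch):

	/* void f(void *arg_a, sljit_sw arg_b) -> placed in SLJIT_S0, SLJIT_S1 */
	sljit_emit_enter(compiler, 0, SLJIT_ARGS2V(P, W),
		1 /* scratches */, 2 /* saveds */, 0, 0, 0);
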
@@ -667,17 +687,25 @@
 #define SLJIT_HAS_COPY_F32		9
 /* [Emulated] Copy from/to f64 operation is available (see sljit_emit_fcopy). */
 #define SLJIT_HAS_COPY_F64		10
+/* [Not emulated] The 64 bit floating point registers can be used as
+   two separate 32 bit floating point registers (e.g. ARM32). The
+   second 32 bit part can be accessed by SLJIT_F64_SECOND. */
+#define SLJIT_HAS_F64_AS_F32_PAIR	11
 /* [Not emulated] Some SIMD operations are supported by the compiler. */
-#define SLJIT_HAS_SIMD			11
+#define SLJIT_HAS_SIMD			12
 /* [Not emulated] SIMD registers are mapped to a pair of double precision
    floating point registers. E.g. passing either SLJIT_FR0 or SLJIT_FR1 to
    a simd operation represents the same 128 bit register, and both SLJIT_FR0
    and SLJIT_FR1 are overwritten. */
-#define SLJIT_SIMD_REGS_ARE_PAIRS	12
+#define SLJIT_SIMD_REGS_ARE_PAIRS	13
+/* [Not emulated] Atomic support is available (fine-grained). */
+#define SLJIT_HAS_ATOMIC		14
 
 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
-/* [Not emulated] SSE2 support is available on x86. */
-#define SLJIT_HAS_SSE2			100
+/* [Not emulated] AVX support is available on x86. */
+#define SLJIT_HAS_AVX			100
+/* [Not emulated] AVX2 support is available on x86. */
+#define SLJIT_HAS_AVX2			101
 #endif
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type);
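
With SLJIT_HAS_SSE2 gone, generic code probes SLJIT_HAS_FPU or SLJIT_HAS_SIMD (as the pcre2_jit_simd_inc.h hunks above now do), and only x86-specific sequences probe the new AVX flags; a sketch:

	if (sljit_has_cpu_feature(SLJIT_HAS_SIMD)
			&& sljit_has_cpu_feature(SLJIT_HAS_AVX2)) {
		/* Safe to emit an AVX2-only sequence, e.g. via sljit_emit_op_custom(). */
	}
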
@@ -873,17 +901,17 @@
 #define SLJIT_MEM0()		(SLJIT_MEM)
 #define SLJIT_MEM1(r1)		(SLJIT_MEM | (r1))
 #define SLJIT_MEM2(r1, r2)	(SLJIT_MEM | (r1) | ((r2) << 8))
-#define SLJIT_IMM		0x40
+#define SLJIT_IMM		0x7f
 #define SLJIT_REG_PAIR(r1, r2)	((r1) | ((r2) << 8))
 
 /* Macros for checking operand types (only for valid arguments). */
 #define SLJIT_IS_REG(arg)	((arg) > 0 && (arg) < SLJIT_IMM)
 #define SLJIT_IS_MEM(arg)	((arg) & SLJIT_MEM)
 #define SLJIT_IS_MEM0(arg)	((arg) == SLJIT_MEM)
-#define SLJIT_IS_MEM1(arg)	((arg) > SLJIT_MEM && (arg) < (SLJIT_MEM + SLJIT_IMM))
+#define SLJIT_IS_MEM1(arg)	((arg) > SLJIT_MEM && (arg) < (SLJIT_MEM << 1))
 #define SLJIT_IS_MEM2(arg)	(((arg) & SLJIT_MEM) && (arg) >= (SLJIT_MEM << 1))
 #define SLJIT_IS_IMM(arg)	((arg) == SLJIT_IMM)
-#define SLJIT_IS_REG_PAIR(arg)	(((arg) >> 8) != 0)
+#define SLJIT_IS_REG_PAIR(arg)	(!((arg) & SLJIT_MEM) && (arg) >= (SLJIT_MEM << 1))
 
 /* Sets 32 bit operation mode on 64 bit CPUs. This option is ignored on
    32 bit CPUs. When this option is set for an arithmetic operation, only
@@ -1813,7 +1841,7 @@
 	sljit_s32 freg,
 	sljit_s32 mem, sljit_sw memw);
 
-/* The following flags are used by several simd operations. */
+/* The following options are used by several simd operations. */
 
 /* Load data into a simd register, this is the default */
 #define SLJIT_SIMD_LOAD			0x000000
@@ -1842,7 +1870,7 @@
 /* Element size is 128 bit long */
 #define SLJIT_SIMD_ELEM_128		(4 << 18)
 
-/* The following flags are used by sljit_emit_simd_mem(). */
+/* The following options are used by sljit_emit_simd_mov(). */
 
 /* Memory address is unaligned (this is the default) */
 #define SLJIT_SIMD_MEM_UNALIGNED	(0 << 24)
@@ -1854,6 +1882,10 @@
 #define SLJIT_SIMD_MEM_ALIGNED_64	(3 << 24)
 /* Memory address is 128 bit aligned */
 #define SLJIT_SIMD_MEM_ALIGNED_128	(4 << 24)
+/* Memory address is 256 bit aligned */
+#define SLJIT_SIMD_MEM_ALIGNED_256	(5 << 24)
+/* Memory address is 512 bit aligned */
+#define SLJIT_SIMD_MEM_ALIGNED_512	(6 << 24)
 
 /* Moves data between a simd register and memory.
 
@@ -1862,7 +1894,7 @@
    it does not emit any instructions.
 
    type must be a combination of SLJIT_SIMD_* and
-     SLJIT_SIMD_MEM_* flags
+     SLJIT_SIMD_MEM_* options
    freg is the source or destination simd register
      of the operation
    srcdst must be a memory operand or a simd register
@@ -1884,14 +1916,14 @@
    SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed,
    it does not emit any instructions.
 
-   type must be a combination of SLJIT_SIMD_* flags except
-     SLJIT_SIMD_STORE.
+   type must be a combination of SLJIT_SIMD_* options
+     except SLJIT_SIMD_STORE.
    freg is the destination simd register of the operation
    src is the value which is replicated
 
    Note:
-       When SLJIT_SIMD_FLOAT is specified, the
-       SLJIT_IMM, 0 can be passed as src/srcw arguments.
+       Passing src == SLJIT_IMM with srcw == 0 can be used to
+       clear a register even when SLJIT_SIMD_FLOAT is set.
 
    Flags: - (does not modify flags) */
 
@@ -1899,6 +1931,13 @@
 	sljit_s32 freg,
 	sljit_s32 src, sljit_sw srcw);
 
+/* The following options are used by sljit_emit_simd_lane_mov(). */
+
+/* Clear all bits of the simd register before loading the lane. */
+#define SLJIT_SIMD_LANE_ZERO		0x000002
+/* Sign extend the integer value stored from the lane. */
+#define SLJIT_SIMD_LANE_SIGNED		0x000004
+
 /* Moves data between a simd register lane and a register or
    memory. If the srcdst argument is a register, it must be
    a floating point register when SLJIT_SIMD_FLOAT is specified,
@@ -1908,7 +1947,13 @@
    SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed,
    it does not emit any instructions.
 
-   type must be a combination of SLJIT_SIMD_* flags
+   type must be a combination of SLJIT_SIMD_* options
+     Further options:
+       SLJIT_32 - when SLJIT_SIMD_FLOAT is not set
+       SLJIT_SIMD_LANE_SIGNED - when SLJIT_SIMD_STORE
+           is set and SLJIT_SIMD_FLOAT is not set
+       SLJIT_SIMD_LANE_ZERO - when SLJIT_SIMD_LOAD
+           is specified
    freg is the source or destination simd register
      of the operation
    lane_index is the index of the lane
@@ -1931,8 +1976,8 @@
    SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed,
    it does not emit any instructions.
 
-   type must be a combination of SLJIT_SIMD_* flags except
-     SLJIT_SIMD_STORE.
+   type must be a combination of SLJIT_SIMD_* options
+     except SLJIT_SIMD_STORE.
    freg is the destination simd register of the operation
    src is the simd register which lane is replicated
    src_lane_index is the lane index of the src register
@@ -1943,6 +1988,57 @@
 	sljit_s32 freg,
 	sljit_s32 src, sljit_s32 src_lane_index);
 
+/* The following options are used by sljit_emit_simd_load_extend(). */
+
+/* Sign extend the integer elements */
+#define SLJIT_SIMD_EXTEND_SIGNED	0x000002
+/* Extend data to 16 bit */
+#define SLJIT_SIMD_EXTEND_16		(1 << 24)
+/* Extend data to 32 bit */
+#define SLJIT_SIMD_EXTEND_32		(2 << 24)
+/* Extend data to 64 bit */
+#define SLJIT_SIMD_EXTEND_64		(3 << 24)
+
+/* Extends the elements and stores them in a simd register.
+   The extension operation increases the size of the
+   elements (e.g. from 16 bit to 64 bit). For integer
+   values, the extension can be signed or unsigned.
+
+   If the operation is not supported, it returns with
+   SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed,
+   it does not emit any instructions.
+
+   type must be a combination of SLJIT_SIMD_*, and
+     SLJIT_SIMD_EXTEND_* options except SLJIT_SIMD_STORE
+   freg is the destination simd register of the operation
+   src must be a memory operand or a simd register.
+     In the latter case, the source elements are stored
+     in the lower half of the register.
+
+   Flags: - (does not modify flags) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw);
+
+/* Extract the highest bit (usually the sign bit) from
+   each element of a vector.
+
+   If the operation is not supported, it returns with
+   SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed,
+   it does not emit any instructions.
+
+   type must be a combination of SLJIT_SIMD_* and SLJIT_32
+     options except SLJIT_SIMD_LOAD
+   freg is the source simd register of the operation
+   dst is the destination operand
+
+   Flags: - (does not modify flags) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw);
+
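
A plausible use of the new store-sign entry point, mirroring e.g. x86 pmovmskb (a sketch; flag values as defined above, with SLJIT_SIMD_ELEM_8 assumed to name 8 bit elements):

	/* Sketch: collect the top bit of each byte lane of FR0 into R0. */
	sljit_emit_simd_sign(compiler,
		SLJIT_SIMD_STORE | SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8,
		SLJIT_FR0, SLJIT_R0, 0);
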
 /* The sljit_emit_atomic_load and sljit_emit_atomic_store operation pair
    can perform an atomic read-modify-write operation. First, an unsigned
    value must be loaded from memory using sljit_emit_atomic_load. Then,
@@ -2039,15 +2135,15 @@
 
 /* Types for sljit_get_register_index */
 
-/* Integer registers. */
-#define SLJIT_INT_REGISTER 0
+/* General purpose (integer) registers. */
+#define SLJIT_GP_REGISTER 0
 /* Floating point registers. */
 #define SLJIT_FLOAT_REGISTER 1
 
 /* The following function is a helper function for sljit_emit_op_custom.
    It returns with the real machine register index ( >=0 ) of any registers.
 
-   When type is SLJIT_INT_REGISTER:
+   When type is SLJIT_GP_REGISTER:
       reg must be an SLJIT_R(i), SLJIT_S(i), or SLJIT_SP register
 
    When type is SLJIT_FLOAT_REGISTER:
diff --git a/src/sljit/sljitNativeARM_32.c b/src/sljit/sljitNativeARM_32.c
index 4a54436..55c62b7 100644
--- a/src/sljit/sljitNativeARM_32.c
+++ b/src/sljit/sljitNativeARM_32.c
@@ -49,8 +49,8 @@
 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
 #define TMP_PC		(SLJIT_NUMBER_OF_REGISTERS + 4)
 
-#define TMP_FREG1	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
-#define TMP_FREG2	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
+#define TMP_FREG1	((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1)
+#define TMP_FREG2	((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2)
 
 /* In ARM instruction words.
    Cache lines are usually 32 byte aligned. */
@@ -67,8 +67,18 @@
 	0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15
 };
 
-static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
-	0, 0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8, 6, 7
+static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+	0,
+	0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
+	0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
+	6, 7
+};
+
+static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+	0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	0, 0
 };
 
 #define RM(rm) ((sljit_ins)reg_map[rm])
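
freg_ebit_map carries the extra VFP register-number bit that the doubled register space needs: the reworked VM/VD/VN macros below place it at bits 5, 22 and 7, the M/D/N bits of the VFP encoding. For single-precision registers that bit is the low bit of the register number, so (freg_map, ebit) == (0, 0) selects s0 and (0, 1) selects s1, the two halves of d0 (my reading of the tables above, not stated in the patch). Illustratively:

	/* The second-half alias of FR0 resolves to the same base number
	   with the ebit set, i.e. the odd single register of the pair. */
	sljit_ins vd = VD(SLJIT_FR0 + SLJIT_FS0); /* freg_map -> Vd field, ebit -> D bit */
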
@@ -76,9 +86,9 @@
 #define RD(rd) ((sljit_ins)reg_map[rd] << 12)
 #define RN(rn) ((sljit_ins)reg_map[rn] << 16)
 
-#define VM(rm) ((sljit_ins)freg_map[rm])
-#define VD(rd) ((sljit_ins)freg_map[rd] << 12)
-#define VN(rn) ((sljit_ins)freg_map[rn] << 16)
+#define VM(vm) (((sljit_ins)freg_map[vm]) | ((sljit_ins)freg_ebit_map[vm] << 5))
+#define VD(vd) (((sljit_ins)freg_map[vd] << 12) | ((sljit_ins)freg_ebit_map[vd] << 22))
+#define VN(vn) (((sljit_ins)freg_map[vn] << 16) | ((sljit_ins)freg_ebit_map[vn] << 7))
 
 /* --------------------------------------------------------------------- */
 /*  Instruction forms                                                    */
@@ -95,13 +105,13 @@
 #define AND		0xe0000000
 #define B		0xea000000
 #define BIC		0xe1c00000
+#define BKPT		0xe1200070
 #define BL		0xeb000000
 #define BLX		0xe12fff30
 #define BX		0xe12fff10
 #define CLZ		0xe16f0f10
 #define CMN		0xe1600000
 #define CMP		0xe1400000
-#define BKPT		0xe1200070
 #define EOR		0xe0200000
 #define LDR		0xe5100000
 #define LDR_POST	0xe4100000
@@ -115,7 +125,6 @@
 #define ORR		0xe1800000
 #define PUSH		0xe92d0000
 #define POP		0xe8bd0000
-#define RBIT		0xe6ff0f30
 #define REV		0xe6bf0f30
 #define REV16		0xe6bf0fb0
 #define RSB		0xe0600000
@@ -152,12 +161,16 @@
 #define VMOV2		0xec400a10
 #define VMOV_i		0xf2800010
 #define VMOV_s		0xee000b10
+#define VMOVN		0xf3b20200
 #define VMRS		0xeef1fa10
 #define VMUL_F32	0xee200a00
 #define VNEG_F32	0xeeb10a40
 #define VORR		0xf2200110
 #define VPOP		0xecbd0b00
 #define VPUSH		0xed2d0b00
+#define VSHLL		0xf2800a10
+#define VSHR		0xf2800010
+#define VSRA		0xf2800110
 #define VST1		0xf4000000
 #define VST1_s		0xf4800000
 #define VSTR_F32	0xed000a00
@@ -167,8 +180,25 @@
 /* Arm v7 specific instructions. */
 #define MOVT		0xe3400000
 #define MOVW		0xe3000000
+#define RBIT		0xe6ff0f30
 #endif
 
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+
+static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s32 fr, sljit_s32 is_32)
+{
+	if (compiler->scratches == -1)
+		return 0;
+
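+	/* For f32 arguments, a register naming the upper half of an f64
+	   register (offset by SLJIT_FS0) is folded back to its base
+	   register before the range checks below. */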
+	if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0))
+		fr -= SLJIT_FS0;
+
+	return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
+		|| (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+}
+
+#endif /* SLJIT_ARGUMENT_CHECKS */
+
 #if (defined SLJIT_CONFIG_ARM_V6 && SLJIT_CONFIG_ARM_V6)
 
 static sljit_s32 push_cpool(struct sljit_compiler *compiler)
@@ -973,25 +1003,43 @@
 {
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
-	case SLJIT_HAS_SIMD:
-	case SLJIT_SIMD_REGS_ARE_PAIRS:
+	case SLJIT_HAS_F64_AS_F32_PAIR:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #else
 		/* Available by default. */
 		return 1;
-#endif
+#endif /* SLJIT_IS_FPU_AVAILABLE */
+	case SLJIT_HAS_SIMD:
+#if (defined SLJIT_CONFIG_ARM_V6 && SLJIT_CONFIG_ARM_V6)
+		return 0;
+#else
+#ifdef SLJIT_IS_FPU_AVAILABLE
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
+#else
+		/* Available by default. */
+		return 1;
+#endif /* SLJIT_IS_FPU_AVAILABLE */
+#endif /* SLJIT_CONFIG_ARM_V6 */
 
+	case SLJIT_SIMD_REGS_ARE_PAIRS:
 	case SLJIT_HAS_CLZ:
-	case SLJIT_HAS_CTZ:
 	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_CMOV:
 	case SLJIT_HAS_REV:
 	case SLJIT_HAS_PREFETCH:
 	case SLJIT_HAS_COPY_F32:
 	case SLJIT_HAS_COPY_F64:
+	case SLJIT_HAS_ATOMIC:
 		return 1;
 
+	case SLJIT_HAS_CTZ:
+#if defined(SLJIT_CONFIG_ARM_V6) && SLJIT_CONFIG_ARM_V6
+		return 2;
+#else
+		return 1;
+#endif /* SLJIT_CONFIG_ARM_V6 */
+
 	default:
 		return 0;
 	}
@@ -1491,8 +1539,16 @@
 	case SLJIT_CTZ:
 		SLJIT_ASSERT(!(flags & INV_IMM) && !(src2 & SRC2_IMM));
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & ARGS_SWAPPED));
+#if (defined SLJIT_CONFIG_ARM_V6 && SLJIT_CONFIG_ARM_V6)
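+		/* ARMv6 has no RBIT. Isolate the lowest set bit (src2 & -src2);
+		   for nonzero input ctz(x) == 31 - clz(x & -x) == clz(x & -x) ^ 0x1f.
+		   The EOR runs under a NE condition (EOR ^ 0xf0000000), so when
+		   src2 == 0 the CLZ result 32 is kept unchanged. */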
+		FAIL_IF(push_inst(compiler, RSB | SRC2_IMM | RD(TMP_REG1) | RN(src2) | 0));
+		FAIL_IF(push_inst(compiler, AND | RD(TMP_REG2) | RN(src2) | RM(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, CLZ | RD(dst) | RM(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, CMP | SET_FLAGS | SRC2_IMM | RN(dst) | 32));
+		return push_inst(compiler, (EOR ^ 0xf0000000) | SRC2_IMM | RD(dst) | RN(dst) | 0x1f);
+#else /* !SLJIT_CONFIG_ARM_V6 */
 		FAIL_IF(push_inst(compiler, RBIT | RD(dst) | RM(src2)));
 		return push_inst(compiler, CLZ | RD(dst) | RM(dst));
+#endif /* SLJIT_CONFIG_ARM_V6 */
 
 	case SLJIT_REV:
 	case SLJIT_REV_U32:
@@ -1951,10 +2007,11 @@
 		if (!(inp_flags & ALLOW_IMM))
 			break;
 
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			src2_reg = (sljit_s32)get_imm((sljit_uw)src2w);
 			if (src2_reg)
 				break;
+
 			if (inp_flags & ALLOW_INV_IMM) {
 				src2_reg = (sljit_s32)get_imm(~(sljit_uw)src2w);
 				if (src2_reg) {
@@ -1962,8 +2019,9 @@
 					break;
 				}
 			}
+
 			if (neg_op != 0) {
-				src2_reg = (sljit_s32)get_imm((sljit_uw)-src2w);
+				src2_reg = (sljit_s32)get_imm((neg_op == SLJIT_ADD || neg_op == SLJIT_SUB) ? (sljit_uw)-src2w : ~(sljit_uw)src2w);
 				if (src2_reg) {
 					op = neg_op | GET_ALL_FLAGS(op);
 					break;
@@ -1971,7 +2029,7 @@
 			}
 		}
 
-		if (src1 & SLJIT_IMM) {
+		if (src1 == SLJIT_IMM) {
 			src2_reg = (sljit_s32)get_imm((sljit_uw)src1w);
 			if (src2_reg) {
 				flags |= ARGS_SWAPPED;
@@ -1979,6 +2037,7 @@
 				src1w = src2w;
 				break;
 			}
+
 			if (inp_flags & ALLOW_INV_IMM) {
 				src2_reg = (sljit_s32)get_imm(~(sljit_uw)src1w);
 				if (src2_reg) {
@@ -1988,8 +2047,11 @@
 					break;
 				}
 			}
+
 			if (neg_op >= SLJIT_SUB) {
 				/* Note: additive operation (commutative). */
+				SLJIT_ASSERT(op == SLJIT_ADD || op == SLJIT_ADDC);
+
 				src2_reg = (sljit_s32)get_imm((sljit_uw)-src1w);
 				if (src2_reg) {
 					src1 = src2;
@@ -2211,16 +2273,16 @@
 		return emit_op(compiler, SLJIT_MOV, ALLOW_ANY_IMM, dst, dstw, TMP_REG1, 0, src, srcw);
 
 	case SLJIT_MOV_U8:
-		return emit_op(compiler, SLJIT_MOV_U8, ALLOW_ANY_IMM | BYTE_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_U8, ALLOW_ANY_IMM | BYTE_SIZE, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u8)srcw : srcw);
 
 	case SLJIT_MOV_S8:
-		return emit_op(compiler, SLJIT_MOV_S8, ALLOW_ANY_IMM | SIGNED | BYTE_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_S8, ALLOW_ANY_IMM | SIGNED | BYTE_SIZE, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s8)srcw : srcw);
 
 	case SLJIT_MOV_U16:
-		return emit_op(compiler, SLJIT_MOV_U16, ALLOW_ANY_IMM | HALF_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_U16, ALLOW_ANY_IMM | HALF_SIZE, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u16)srcw : srcw);
 
 	case SLJIT_MOV_S16:
-		return emit_op(compiler, SLJIT_MOV_S16, ALLOW_ANY_IMM | SIGNED | HALF_SIZE, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_S16, ALLOW_ANY_IMM | SIGNED | HALF_SIZE, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s16)srcw : srcw);
 
 	case SLJIT_CLZ:
 	case SLJIT_CTZ:
@@ -2262,7 +2324,7 @@
 
 	case SLJIT_XOR:
 		inp_flags = ALLOW_IMM | ALLOW_DOUBLE_IMM;
-		if (((src1 & SLJIT_IMM) && src1w == -1) || ((src2 & SLJIT_IMM) && src2w == -1)) {
+		if ((src1 == SLJIT_IMM && src1w == -1) || (src2 == SLJIT_IMM && src2w == -1)) {
 			inp_flags |= ALLOW_INV_IMM;
 		}
 		return emit_op(compiler, op, inp_flags, dst, dstw, src1, src1w, src2, src2w);
@@ -2281,7 +2343,7 @@
 	case SLJIT_MASHR:
 	case SLJIT_ROTL:
 	case SLJIT_ROTR:
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			compiler->shift_imm = src2w & 0x1f;
 			return emit_op(compiler, op, 0, dst, dstw, TMP_REG1, 0, src1, src1w);
 		} else {
@@ -2326,7 +2388,7 @@
 	ADJUST_LOCAL_OFFSET(src3, src3w);
 
 	/* Shift type of ROR is 3. */
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		src3w &= 0x1f;
 
 		if (src3w == 0)
@@ -2427,7 +2489,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return reg_map[reg];
 
 	if (type == SLJIT_FLOAT_REGISTER || type == SLJIT_SIMD_REG_64)
@@ -2689,8 +2751,10 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset32(struct sljit_compiler *compiler,
 	sljit_s32 freg, sljit_f32 value)
 {
+#if defined(__ARM_NEON) && __ARM_NEON
 	sljit_u32 exp;
 	sljit_ins ins;
+#endif /* NEON */
 	union {
 		sljit_u32 imm;
 		sljit_f32 value;
@@ -2701,6 +2765,7 @@
 
 	u.value = value;
 
+#if defined(__ARM_NEON) && __ARM_NEON
 	if ((u.imm << (32 - 19)) == 0) {
 		exp = (u.imm >> (23 + 2)) & 0x3f;
 
@@ -2709,6 +2774,7 @@
 			return push_inst(compiler, (VMOV_F32 ^ (1 << 6)) | ((ins & 0xf0) << 12) | VD(freg) | (ins & 0xf));
 		}
 	}
+#endif /* NEON */
 
 	FAIL_IF(load_immediate(compiler, TMP_REG1, u.imm));
 	return push_inst(compiler, VMOV | VN(freg) | RD(TMP_REG1));
@@ -2717,8 +2783,10 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset64(struct sljit_compiler *compiler,
 	sljit_s32 freg, sljit_f64 value)
 {
+#if defined(__ARM_NEON) && __ARM_NEON
 	sljit_u32 exp;
 	sljit_ins ins;
+#endif /* NEON */
 	union {
 		sljit_u32 imm[2];
 		sljit_f64 value;
@@ -2729,6 +2797,7 @@
 
 	u.value = value;
 
+#if defined(__ARM_NEON) && __ARM_NEON
 	if (u.imm[0] == 0 && (u.imm[1] << (64 - 48)) == 0) {
 		exp = (u.imm[1] >> ((52 - 32) + 2)) & 0x1ff;
 
@@ -2737,6 +2806,7 @@
 			return push_inst(compiler, (VMOV_F32 ^ (1 << 6)) | (1 << 8) | ((ins & 0xf0) << 12) | VD(freg) | (ins & 0xf));
 		}
 	}
+#endif /* NEON */
 
 	FAIL_IF(load_immediate(compiler, TMP_REG1, u.imm[0]));
 	if (u.imm[0] == u.imm[1])
@@ -3169,7 +3239,7 @@
 
 	SLJIT_ASSERT(reg_map[TMP_REG1] != 14);
 
-	if (!(src & SLJIT_IMM)) {
+	if (src != SLJIT_IMM) {
 		if (FAST_IS_REG(src)) {
 			SLJIT_ASSERT(reg_map[src] != 14);
 			return push_inst(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RM(src));
@@ -3365,7 +3435,7 @@
 
 	cc = get_cc(compiler, type & ~SLJIT_32);
 
-	if (SLJIT_UNLIKELY(src1 & SLJIT_IMM)) {
+	if (SLJIT_UNLIKELY(src1 == SLJIT_IMM)) {
 		tmp = get_imm((sljit_uw)src1w);
 		if (tmp)
 			return push_inst(compiler, ((MOV | RD(dst_reg) | tmp) & ~COND_MASK) | cc);
@@ -3686,6 +3756,20 @@
 	return push_inst(compiler, ADD | RD(TMP_REG1) | RN(TMP_REG1) | RM(mem));
 }
 
+static SLJIT_INLINE sljit_s32 simd_get_quad_reg_index(sljit_s32 freg)
+{
+	freg += freg & 0x1;
+
+	SLJIT_ASSERT((freg_map[freg] & 0x1) == (freg <= SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS));
+
+	if (freg <= SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS)
+		freg--;
+
+	return freg;
+}
+
+#define SLJIT_QUAD_OTHER_HALF(freg) ((((freg) & 0x1) << 1) - 1)
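+
+/* NEON quad (q) registers occupy an even/odd d register pair.
+   simd_get_quad_reg_index() returns the sljit register that maps to the
+   even (low) d register of the pair, while SLJIT_QUAD_OTHER_HALF()
+   steps from one half of the pair to the other (+1 or -1). */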
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 srcdst, sljit_sw srcdstw)
@@ -3710,9 +3794,12 @@
 		return SLJIT_SUCCESS;
 
 	if (reg_size == 4)
-		freg -= 1 - (freg & 0x1);
+		freg = simd_get_quad_reg_index(freg);
 
 	if (!(srcdst & SLJIT_MEM)) {
+		if (reg_size == 4)
+			srcdst = simd_get_quad_reg_index(srcdst);
+
 		if (type & SLJIT_SIMD_STORE)
 			ins = VD(srcdst) | VN(freg) | VM(freg);
 		else
@@ -3859,9 +3946,9 @@
 		return SLJIT_SUCCESS;
 
 	if (reg_size == 4)
-		freg -= 1 - (freg & 0x1);
+		freg = simd_get_quad_reg_index(freg);
 
-	if ((src & SLJIT_IMM) && srcw == 0)
+	if (src == SLJIT_IMM && srcw == 0)
 		return push_inst(compiler, VMOV_i | ((reg_size == 4) ? (1 << 6) : 0) | VD(freg));
 
 	if (SLJIT_UNLIKELY(elem_size == 3)) {
@@ -3873,7 +3960,7 @@
 		} else if (freg != src)
 			FAIL_IF(push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src)));
 
-		freg++;
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
 
 		if (freg != src)
 			return push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src));
@@ -3892,15 +3979,16 @@
 	}
 
 	if (type & SLJIT_SIMD_FLOAT) {
-		ins = ((sljit_ins)1 << (16 + elem_size));
+		SLJIT_ASSERT(elem_size == 2);
+		ins = ((sljit_ins)freg_ebit_map[src] << (16 + 2 + 1)) | ((sljit_ins)1 << (16 + 2));
 
 		if (reg_size == 4)
 			ins |= 1 << 6;
 
-		return push_inst(compiler, VDUP_s | ins | VD(freg) | VM(src));
+		return push_inst(compiler, VDUP_s | ins | VD(freg) | (sljit_ins)freg_map[src]);
 	}
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (elem_size < 2)
 			srcw &= ((sljit_sw)1 << (((sljit_sw)1 << elem_size) << 3)) - 1;
 
@@ -3957,13 +4045,37 @@
 	if (type & SLJIT_SIMD_TEST)
 		return SLJIT_SUCCESS;
 
-	if (reg_size == 4) {
-		freg -= 1 - (freg & 0x1);
+	if (reg_size == 4)
+		freg = simd_get_quad_reg_index(freg);
 
-		if (lane_index >= (0x8 >> elem_size)) {
-			lane_index -= (0x8 >> elem_size);
-			freg++;
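+	/* SLJIT_SIMD_LANE_ZERO: zero the whole register before inserting the
+	   lane; if srcdst overlaps freg, the value is rescued into TMP_FREG1
+	   (or moved to the surviving half for f64 elements) first. */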
+	if (type & SLJIT_SIMD_LANE_ZERO) {
+		ins = (reg_size == 3) ? 0 : ((sljit_ins)1 << 6);
+
+		if (type & SLJIT_SIMD_FLOAT) {
+			if (elem_size == 3 && !(srcdst & SLJIT_MEM)) {
+				if (lane_index == 1)
+					freg += SLJIT_QUAD_OTHER_HALF(freg);
+
+				if (srcdst != freg)
+					FAIL_IF(push_inst(compiler, VORR | VD(freg) | VN(srcdst) | VM(srcdst)));
+
+				freg += SLJIT_QUAD_OTHER_HALF(freg);
+				return push_inst(compiler, VMOV_i | VD(freg));
+			}
+
+			if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) {
+				FAIL_IF(push_inst(compiler, VORR | ins | VD(TMP_FREG1) | VN(freg) | VM(freg)));
+				srcdst = TMP_FREG1;
+				srcdstw = 0;
+			}
 		}
+
+		FAIL_IF(push_inst(compiler, VMOV_i | ins | VD(freg)));
+	}
+
+	if (reg_size == 4 && lane_index >= (0x8 >> elem_size)) {
+		lane_index -= (0x8 >> elem_size);
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
 	}
 
 	if (srcdst & SLJIT_MEM) {
@@ -3985,16 +4097,22 @@
 		}
 
 		if (type & SLJIT_SIMD_STORE) {
-			if (lane_index == 0)
-				return push_inst(compiler, VMOV_F32 | SLJIT_32 | VD(srcdst) | VM(freg));
-			return push_inst(compiler, VDUP_s | 0xc0000 | VD(srcdst) | VM(freg));
+			if (freg_ebit_map[freg] == 0) {
+				if (lane_index == 1)
+					freg = SLJIT_F64_SECOND(freg);
+
+				return push_inst(compiler, VMOV_F32 | VD(srcdst) | VM(freg));
+			}
+
+			FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | ((sljit_ins)lane_index << 21) | VN(freg) | RD(TMP_REG1)));
+			return push_inst(compiler, VMOV | VN(srcdst) | RD(TMP_REG1));
 		}
 
 		FAIL_IF(push_inst(compiler, VMOV | (1 << 20) | VN(srcdst) | RD(TMP_REG1)));
 		return push_inst(compiler, VMOV_s | ((sljit_ins)lane_index << 21) | VN(freg) | RD(TMP_REG1));
 	}
 
-	if (srcdst & SLJIT_IMM) {
+	if (srcdst == SLJIT_IMM) {
 		if (elem_size < 2)
 			srcdstw &= ((sljit_sw)1 << (((sljit_sw)1 << elem_size) << 3)) - 1;
 
@@ -4013,10 +4131,10 @@
 	ins |= (sljit_ins)(((lane_index & 0x4) << 19) | ((lane_index & 0x3) << 5));
 
 	if (type & SLJIT_SIMD_STORE) {
-		ins |= 0x100000;
+		ins |= (1 << 20);
 
-		if (elem_size < 2)
-			ins |= 0x800000;
+		if (elem_size < 2 && !(type & SLJIT_SIMD_LANE_SIGNED))
+			ins |= (1 << 23);
 	}
 
 	return push_inst(compiler, VMOV_s | ins | VN(freg) | RD(srcdst));
@@ -4043,12 +4161,12 @@
 		return SLJIT_SUCCESS;
 
 	if (reg_size == 4) {
-		freg -= 1 - (freg & 0x1);
-		src -= 1 - (src & 0x1);
+		freg = simd_get_quad_reg_index(freg);
+		src = simd_get_quad_reg_index(src);
 
 		if (src_lane_index >= (0x8 >> elem_size)) {
 			src_lane_index -= (0x8 >> elem_size);
-			src++;
+			src += SLJIT_QUAD_OTHER_HALF(src);
 		}
 	}
 
@@ -4056,7 +4174,7 @@
 		if (freg != src)
 			FAIL_IF(push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src)));
 
-		freg++;
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
 
 		if (freg != src)
 			return push_inst(compiler, VORR | VD(freg) | VN(src) | VM(src));
@@ -4071,6 +4189,148 @@
 	return push_inst(compiler, VDUP_s | ins | VD(freg) | VM(src));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+	sljit_s32 dst_reg;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
+
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size != 2 || elem2_size != 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (reg_size == 4)
+		freg = simd_get_quad_reg_index(freg);
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src, srcw));
+		if (reg_size == 4 && elem2_size - elem_size == 1)
+			FAIL_IF(push_inst(compiler, VLD1 | (0x7 << 8) | VD(freg) | RN(src) | 0xf));
+		else
+			FAIL_IF(push_inst(compiler, VLD1_s | (sljit_ins)((reg_size - elem2_size + elem_size) << 10) | VD(freg) | RN(src) | 0xf));
+		src = freg;
+	} else if (reg_size == 4)
+		src = simd_get_quad_reg_index(src);
+
+	if (!(type & SLJIT_SIMD_FLOAT)) {
+		dst_reg = (reg_size == 4) ? freg : TMP_FREG1;
+
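+		/* Each VSHLL doubles the element size; repeat until the
+		   requested elem2_size is reached. */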
+		do {
+			FAIL_IF(push_inst(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0 : (1 << 24))
+				| ((sljit_ins)1 << (19 + elem_size)) | VD(dst_reg) | VM(src)));
+			src = dst_reg;
+		} while (++elem_size < elem2_size);
+
+		if (dst_reg == TMP_FREG1)
+			return push_inst(compiler, VORR | VD(freg) | VN(TMP_FREG1) | VM(TMP_FREG1));
+		return SLJIT_SUCCESS;
+	}
+
+	/* No SIMD variant, must use VFP instead. */
+	SLJIT_ASSERT(reg_size == 4);
+
+	if (freg == src) {
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
+		FAIL_IF(push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src) | 0x20));
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
+		return push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src));
+	}
+
+	FAIL_IF(push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src)));
+	freg += SLJIT_QUAD_OTHER_HALF(freg);
+	return push_inst(compiler, VCVT_F64_F32 | VD(freg) | VM(src) | 0x20);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins ins, imms;
+	sljit_s32 dst_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
+
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
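+	/* Each byte of imms holds the immediate field of one VSRA step;
+	   the loop below consumes them from the low byte up, folding the
+	   per-lane sign bits isolated by the initial VSHR into one group. */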
+	switch (elem_size) {
+	case 0:
+		imms = 0x243219;
+		ins = VSHR | (1 << 24) | (0x9 << 16);
+		break;
+	case 1:
+		imms = (reg_size == 4) ? 0x243219 : 0x2231;
+		ins = VSHR | (1 << 24) | (0x11 << 16);
+		break;
+	case 2:
+		imms = (reg_size == 4) ? 0x2231 : 0x21;
+		ins = VSHR | (1 << 24) | (0x21 << 16);
+		break;
+	default:
+		imms = 0x21;
+		ins = VSHR | (1 << 24) | (0x1 << 16) | (1 << 7);
+		break;
+	}
+
+	if (reg_size == 4) {
+		freg = simd_get_quad_reg_index(freg);
+		ins |= (1 << 6);
+	}
+
+	SLJIT_ASSERT((freg_map[TMP_FREG1] & 0x1) == 0);
+	FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG1) | VM(freg)));
+
+	if (reg_size == 4 && elem_size > 0)
+		FAIL_IF(push_inst(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD(TMP_FREG1) | VM(TMP_FREG1)));
+
+	ins = (reg_size == 4 && elem_size == 0) ? (1 << 6) : 0;
+
+	while (imms >= 0x100) {
+		FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | ((imms & 0xff) << 16) | VD(TMP_FREG1) | VM(TMP_FREG1)));
+		imms >>= 8;
+	}
+
+	FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | (1 << 7) | (imms << 16) | VD(TMP_FREG1) | VM(TMP_FREG1)));
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+	FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(dst_r) | VN(TMP_FREG1)));
+
+	if (reg_size == 4 && elem_size == 0) {
+		SLJIT_ASSERT(freg_map[TMP_FREG1] + 1 == freg_map[TMP_FREG2]);
+		FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(TMP_REG2) | VN(TMP_FREG2)));
+		FAIL_IF(push_inst(compiler, ORR | RD(dst_r) | RN(dst_r) | RM(TMP_REG2) | (0x8 << 7)));
+	}
+
+	if (dst_r == TMP_REG1)
+		return emit_op_mem(compiler, WORD_SIZE, TMP_REG1, dst, dstw, TMP_REG2);
+
+	return SLJIT_SUCCESS;
+}
+
 #undef FPU_LOAD
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
@@ -4104,6 +4364,9 @@
 {
 	sljit_u32 ins;
 
+	/* The case temp_reg == mem_reg is undefined, so another temp register is used. */
+	SLJIT_UNUSED_ARG(temp_reg);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
 
@@ -4120,7 +4383,10 @@
 	}
 
 	FAIL_IF(push_inst(compiler, ins | RN(mem_reg) | RD(TMP_REG1) | RM(src_reg)));
-	return push_inst(compiler, CMP | SET_FLAGS | SRC2_IMM | RN(TMP_REG1) | 0);
+	if (op & SLJIT_SET_ATOMIC_STORED)
+		return push_inst(compiler, CMP | SET_FLAGS | SRC2_IMM | RN(TMP_REG1));
+
+	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
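
The atomic pair above is meant to be wrapped in a retry loop by the
caller. A minimal sketch, assuming the SLJIT_ATOMIC_NOT_STORED jump type
declared in sljitLir.h of this revision (atomically increments the byte
at the address in R1, with R2 as the required distinct temp register):

    struct sljit_label *retry = sljit_emit_label(compiler);
    sljit_emit_atomic_load(compiler, SLJIT_MOV_U8, SLJIT_R0, SLJIT_R1);
    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, 1);
    sljit_emit_atomic_store(compiler, SLJIT_MOV_U8 | SLJIT_SET_ATOMIC_STORED,
        SLJIT_R0, SLJIT_R1, SLJIT_R2);
    sljit_set_label(sljit_emit_jump(compiler, SLJIT_ATOMIC_NOT_STORED), retry);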
diff --git a/src/sljit/sljitNativeARM_64.c b/src/sljit/sljitNativeARM_64.c
index f9c64d7..8194670 100644
--- a/src/sljit/sljitNativeARM_64.c
+++ b/src/sljit/sljitNativeARM_64.c
@@ -98,6 +98,7 @@
 #define FCMP 0x1e602000
 #define FCSEL 0x1e600c00
 #define FCVT 0x1e224000
+#define FCVTL 0x0e217800
 #define FCVTZS 0x9e780000
 #define FDIV 0x1e601800
 #define FMOV 0x1e604000
@@ -143,7 +144,9 @@
 #define SCVTF 0x9e620000
 #define SDIV 0x9ac00c00
 #define SMADDL 0x9b200000
+#define SMOV 0x0e002c00
 #define SMULH 0x9b403c00
+#define SSHLL 0x0f00a400
 #define ST1 0x0c007000
 #define ST1_s 0x0d000000
 #define STP 0xa9000000
@@ -169,12 +172,15 @@
 #define UDIV 0x9ac00800
 #define UMOV 0x0e003c00
 #define UMULH 0x9bc03c00
+#define USHLL 0x2f00a400
+#define USHR 0x2f000400
+#define USRA 0x2f001400
+#define XTN 0x0e212800
 
 #define CSET (CSINC | RM(TMP_ZERO) | RN(TMP_ZERO))
 #define LDR (STRI | (1 << 22))
 #define LDRB (STRBI | (1 << 22))
 #define LDRH (LDRB | (1 << 30))
-#define LDRSW ((LDRI ^ (1 << 30)) ^ (0x3 << 22))
 #define MOV (ORR | RN(TMP_ZERO))
 
 static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins)
@@ -423,7 +429,7 @@
 	case SLJIT_HAS_FPU:
 	case SLJIT_HAS_SIMD:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #else
 		/* Available by default. */
 		return 1;
@@ -437,6 +443,7 @@
 	case SLJIT_HAS_PREFETCH:
 	case SLJIT_HAS_COPY_F32:
 	case SLJIT_HAS_COPY_F64:
+	case SLJIT_HAS_ATOMIC:
 		return 1;
 
 	default:
@@ -1423,33 +1430,33 @@
 			break;
 		case SLJIT_MOV_U8:
 			mem_flags = BYTE_SIZE;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_u8)srcw;
 			break;
 		case SLJIT_MOV_S8:
 			mem_flags = BYTE_SIZE | SIGNED;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_s8)srcw;
 			break;
 		case SLJIT_MOV_U16:
 			mem_flags = HALF_SIZE;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_u16)srcw;
 			break;
 		case SLJIT_MOV_S16:
 			mem_flags = HALF_SIZE | SIGNED;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_s16)srcw;
 			break;
 		case SLJIT_MOV_U32:
 			mem_flags = INT_SIZE;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_u32)srcw;
 			break;
 		case SLJIT_MOV_S32:
 		case SLJIT_MOV32:
 			mem_flags = INT_SIZE | SIGNED;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_s32)srcw;
 			break;
 		default:
@@ -1458,7 +1465,7 @@
 			break;
 		}
 
-		if (src & SLJIT_IMM)
+		if (src == SLJIT_IMM)
 			FAIL_IF(emit_op_imm(compiler, SLJIT_MOV | ARG2_IMM, dst_r, TMP_REG1, srcw));
 		else if (!(src & SLJIT_MEM))
 			dst_r = src;
@@ -1538,12 +1545,12 @@
 		src2 = TMP_REG2;
 	}
 
-	if (src1 & SLJIT_IMM)
+	if (src1 == SLJIT_IMM)
 		flags |= ARG1_IMM;
 	else
 		src1w = src1;
 
-	if (src2 & SLJIT_IMM)
+	if (src2 == SLJIT_IMM)
 		flags |= ARG2_IMM;
 	else
 		src2w = src2;
@@ -1590,7 +1597,7 @@
 
 	inv_bits = (op & SLJIT_32) ? W_OP : 0;
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		mask = inv_bits ? 0x1f : 0x3f;
 		src3w &= mask;
 
@@ -1705,7 +1712,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return reg_map[reg];
 
 	if (type != SLJIT_FLOAT_REGISTER && type != SLJIT_SIMD_REG_64 && type != SLJIT_SIMD_REG_128)
@@ -1808,7 +1815,7 @@
 	if (src & SLJIT_MEM) {
 		emit_op_mem(compiler, (ins & W_OP) ? WORD_SIZE : INT_SIZE, TMP_REG1, src, srcw, TMP_REG1);
 		src = TMP_REG1;
-	} else if (src & SLJIT_IMM) {
+	} else if (src == SLJIT_IMM) {
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
 	}
@@ -1829,7 +1836,7 @@
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32) {
 		inv_bits |= W_OP;
 
-		if (src & SLJIT_IMM)
+		if (src == SLJIT_IMM)
 			srcw = (sljit_s32)srcw;
 	}
 
@@ -1845,7 +1852,7 @@
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_U32) {
 		inv_bits |= W_OP;
 
-		if (src & SLJIT_IMM)
+		if (src == SLJIT_IMM)
 			srcw = (sljit_u32)srcw;
 	}
 
@@ -2216,7 +2223,7 @@
 		PTR_FAIL_IF(emit_op_mem(compiler, inv_bits ? INT_SIZE : WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
 		src = TMP_REG1;
 	}
-	else if (src & SLJIT_IMM) {
+	else if (src == SLJIT_IMM) {
 		PTR_FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
 	}
@@ -2240,7 +2247,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
 
-	if (!(src & SLJIT_IMM)) {
+	if (src != SLJIT_IMM) {
 		if (src & SLJIT_MEM) {
 			ADJUST_LOCAL_OFFSET(src, srcw);
 			FAIL_IF(emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src, srcw, TMP_REG1));
@@ -2349,7 +2356,7 @@
 
 	ADJUST_LOCAL_OFFSET(src1, src1w);
 
-	if (src1 & SLJIT_IMM) {
+	if (src1 == SLJIT_IMM) {
 		if (type & SLJIT_32)
 			src1w = (sljit_s32)src1w;
 		FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
@@ -2775,13 +2782,13 @@
 		ins |= (sljit_ins)1 << 30;
 
 	if (type & SLJIT_SIMD_FLOAT) {
-		if (src & SLJIT_IMM)
+		if (src == SLJIT_IMM)
 			return push_inst(compiler, MOVI | (ins & ((sljit_ins)1 << 30)) | VD(freg));
 
 		return push_inst(compiler, DUP_e | ins | VD(freg) | VN(src));
 	}
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (elem_size < 3)
 			srcw &= ((sljit_sw)1 << (((sljit_sw)1 << elem_size) << 3)) - 1;
 
@@ -2822,6 +2829,18 @@
 	if (type & SLJIT_SIMD_TEST)
 		return SLJIT_SUCCESS;
 
+	if (type & SLJIT_SIMD_LANE_ZERO) {
+		ins = (reg_size == 3) ? 0 : ((sljit_ins)1 << 30);
+
+		if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) {
+			FAIL_IF(push_inst(compiler, ORR_v | ins | VD(TMP_FREG1) | VN(freg) | VM(freg)));
+			srcdst = TMP_FREG1;
+			srcdstw = 0;
+		}
+
+		FAIL_IF(push_inst(compiler, MOVI | ins | VD(freg)));
+	}
+
 	if (srcdst & SLJIT_MEM) {
 		FAIL_IF(sljit_emit_simd_mem_offset(compiler, &srcdst, srcdstw));
 
@@ -2840,14 +2859,14 @@
 
 	if (type & SLJIT_SIMD_FLOAT) {
 		if (type & SLJIT_SIMD_STORE)
-			ins = INS_e | ((sljit_ins)1 << (16 + elem_size)) | ((sljit_ins)lane_index << (11 + elem_size)) | RD(srcdst) | VN(freg);
+			ins = INS_e | ((sljit_ins)1 << (16 + elem_size)) | ((sljit_ins)lane_index << (11 + elem_size)) | VD(srcdst) | VN(freg);
 		else
-			ins = INS_e | ((((sljit_ins)lane_index << 1) | 1) << (16 + elem_size)) | VD(freg) | RN(srcdst);
+			ins = INS_e | ((((sljit_ins)lane_index << 1) | 1) << (16 + elem_size)) | VD(freg) | VN(srcdst);
 
 		return push_inst(compiler, ins);
 	}
 
-	if (srcdst & SLJIT_IMM) {
+	if (srcdst == SLJIT_IMM) {
 		if (elem_size < 3)
 			srcdstw &= ((sljit_sw)1 << (((sljit_sw)1 << elem_size) << 3)) - 1;
 
@@ -2855,9 +2874,17 @@
 		srcdst = TMP_REG1;
 	}
 
-	if (type & SLJIT_SIMD_STORE)
-		ins = UMOV | RD(srcdst) | VN(freg);
-	else
+	if (type & SLJIT_SIMD_STORE) {
+		ins = RD(srcdst) | VN(freg);
+
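+		/* SMOV sign-extends the extracted lane into the general purpose
+		   register, UMOV zero-extends it. */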
+		if ((type & SLJIT_SIMD_LANE_SIGNED) && (elem_size < 2 || (elem_size == 2 && !(type & SLJIT_32)))) {
+			ins |= SMOV;
+
+			if (!(type & SLJIT_32))
+				ins |= (sljit_ins)1 << 30;
+		} else
+			ins |= UMOV;
+	} else
 		ins = INS | VD(freg) | RN(srcdst);
 
 	if (elem_size == 3)
@@ -2894,6 +2921,129 @@
 	return push_inst(compiler, DUP_e | ins | VD(freg) | VN(src));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
+
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size != 2 || elem2_size != 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src, srcw));
+
+		if (reg_size == 4 && elem2_size - elem_size == 1)
+			FAIL_IF(push_inst(compiler, LD1 | ((sljit_ins)elem_size << 10) | RN(src) | VT(freg)));
+		else
+			FAIL_IF(push_inst(compiler, LD1_s | ((sljit_ins)0x2000 << (reg_size - elem2_size + elem_size)) | RN(src) | VT(freg)));
+		src = freg;
+	}
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		SLJIT_ASSERT(reg_size == 4);
+		return push_inst(compiler, FCVTL | (1 << 22) | VD(freg) | VN(src));
+	}
+
+	do {
+		FAIL_IF(push_inst(compiler, ((type & SLJIT_SIMD_EXTEND_SIGNED) ? SSHLL : USHLL)
+			| ((sljit_ins)1 << (19 + elem_size)) | VD(freg) | VN(src)));
+		src = freg;
+	} while (++elem_size < elem2_size);
+
+	return SLJIT_SUCCESS;
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins ins, imms;
+	sljit_s32 dst_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
+
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	switch (elem_size) {
+	case 0:
+		imms = 0x643219;
+		ins = USHR | (0x9 << 16);
+		break;
+	case 1:
+		imms = (reg_size == 4) ? 0x643219 : 0x6231;
+		ins = USHR | (0x11 << 16);
+		break;
+	case 2:
+		imms = (reg_size == 4) ? 0x6231 : 0x61;
+		ins = USHR | (0x21 << 16);
+		break;
+	default:
+		imms = 0x61;
+		ins = USHR | (0x41 << 16);
+		break;
+	}
+
+	if (reg_size == 4)
+		ins |= (1 << 30);
+
+	FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG1) | VN(freg)));
+
+	if (reg_size == 4 && elem_size > 0)
+		FAIL_IF(push_inst(compiler, XTN | ((sljit_ins)(elem_size - 1) << 22) | VD(TMP_FREG1) | VN(TMP_FREG1)));
+
+	if (imms >= 0x100) {
+		ins = (reg_size == 4 && elem_size == 0) ? (1 << 30) : 0;
+
+		do {
+			FAIL_IF(push_inst(compiler, USRA | ins | ((imms & 0xff) << 16) | VD(TMP_FREG1) | VN(TMP_FREG1)));
+			imms >>= 8;
+		} while (imms >= 0x100);
+	}
+
+	FAIL_IF(push_inst(compiler, USRA | (1 << 30) | (imms << 16) | VD(TMP_FREG1) | VN(TMP_FREG1)));
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+	ins = (0x1 << 16);
+
+	if (reg_size == 4 && elem_size == 0) {
+		FAIL_IF(push_inst(compiler, INS_e | (0x3 << 16) | (0x8 << 11) | VD(TMP_FREG1) | VN(TMP_FREG1)));
+		ins = (0x2 << 16);
+	}
+
+	FAIL_IF(push_inst(compiler, UMOV | ins | RD(dst_r) | VN(TMP_FREG1)));
+
+	if (dst_r == TMP_REG1)
+		return emit_op_mem(compiler, STORE | ((type & SLJIT_32) ? INT_SIZE : WORD_SIZE), TMP_REG1, dst, dstw, TMP_REG2);
+
+	return SLJIT_SUCCESS;
+}
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst_reg,
 	sljit_s32 mem_reg)
@@ -2906,10 +3056,8 @@
 #ifdef __ARM_FEATURE_ATOMICS
 	switch (GET_OPCODE(op)) {
 	case SLJIT_MOV32:
-		ins = LDR ^ (1 << 30);
-		break;
 	case SLJIT_MOV_U32:
-		ins = LDRSW;
+		ins = LDR ^ (1 << 30);
 		break;
 	case SLJIT_MOV_U16:
 		ins = LDRH;
@@ -2978,7 +3126,7 @@
 	}
 
 	if (cmp) {
-		FAIL_IF(push_inst(compiler, MOV ^ inv_bits | RM(temp_reg) | RD(TMP_REG1)));
+		FAIL_IF(push_inst(compiler, (MOV ^ inv_bits) | RM(temp_reg) | RD(TMP_REG1)));
 		tmp = TMP_REG1;
 	}
 	FAIL_IF(push_inst(compiler, ins | RM(tmp) | RN(mem_reg) | RD(src_reg)));
@@ -2986,7 +3134,7 @@
 		return SLJIT_SUCCESS;
 
 	FAIL_IF(push_inst(compiler, cmp | RM(tmp) | RN(temp_reg)));
-	FAIL_IF(push_inst(compiler, CSET ^ inv_bits | RD(tmp)));
+	FAIL_IF(push_inst(compiler, (CSET ^ inv_bits) | RD(tmp)));
 	return push_inst(compiler, cmp | RM(tmp) | RN(TMP_ZERO));
 #else /* !__ARM_FEATURE_ATOMICS */
 	SLJIT_UNUSED_ARG(tmp);
diff --git a/src/sljit/sljitNativeARM_T2_32.c b/src/sljit/sljitNativeARM_T2_32.c
index 8d0b769..f914eb7 100644
--- a/src/sljit/sljitNativeARM_T2_32.c
+++ b/src/sljit/sljitNativeARM_T2_32.c
@@ -41,16 +41,26 @@
 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
 #define TMP_PC		(SLJIT_NUMBER_OF_REGISTERS + 4)
 
-#define TMP_FREG1	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
-#define TMP_FREG2	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
+#define TMP_FREG1	((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1)
+#define TMP_FREG2	((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2)
 
 /* See sljit_emit_enter and sljit_emit_op0 if you want to change them. */
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
 	0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15
 };
 
-static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
-	0, 0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8, 6, 7
+static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+	0,
+	0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
+	0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
+	6, 7
+};
+
+static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+	0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	0, 0
 };
 
 #define COPY_BITS(src, from, to, bits) \
@@ -79,9 +89,11 @@
 #define RD4(rd) ((sljit_ins)reg_map[rd] << 8)
 #define RT4(rt) ((sljit_ins)reg_map[rt] << 12)
 #define RN4(rn) ((sljit_ins)reg_map[rn] << 16)
-#define VM4(dm) ((sljit_ins)freg_map[dm])
-#define VD4(dd) ((sljit_ins)freg_map[dd] << 12)
-#define VN4(dn) ((sljit_ins)freg_map[dn] << 16)
+
+#define VM4(vm) (((sljit_ins)freg_map[vm]) | ((sljit_ins)freg_ebit_map[vm] << 5))
+#define VD4(vd) (((sljit_ins)freg_map[vd] << 12) | ((sljit_ins)freg_ebit_map[vd] << 22))
+#define VN4(vn) (((sljit_ins)freg_map[vn] << 16) | ((sljit_ins)freg_ebit_map[vn] << 7))
+
 #define IMM5(imm) \
 	(COPY_BITS(imm, 2, 12, 3) | (((sljit_ins)imm & 0x3) << 6))
 #define IMM12(imm) \
@@ -128,12 +140,12 @@
 #define EORS		0x4040
 #define EOR_W		0xea800000
 #define IT		0xbf00
-#define LDAEX		0xe8d00fef
-#define LDAEXB		0xe8d00fcf
-#define LDAEXH		0xe8d00fdf
 #define LDR		0xf8d00000
 #define LDR_SP		0x9800
 #define LDRD		0xe9500000
+#define LDREX		0xe8500f00
+#define LDREXB		0xe8d00f4f
+#define LDREXH		0xe8d00f5f
 #define LDRI		0xf8500800
 #define LSLS		0x4080
 #define LSLSI		0x0000
@@ -180,9 +192,9 @@
 #define SMULL		0xfb800000
 #define STR_SP		0x9000
 #define STRD		0xe9400000
-#define STREX		0xe8c00fe0
-#define STREXB		0xe8c00fc0
-#define STREXH		0xe8c00fd0
+#define STREX		0xe8400000
+#define STREXB		0xe8c00f40
+#define STREXH		0xe8c00f50
 #define SUBS		0x1a00
 #define SUBSI3		0x1e00
 #define SUBSI8		0x3800
@@ -222,17 +234,37 @@
 #define VMOV2		0xec400a10
 #define VMOV_i		0xef800010
 #define VMOV_s		0xee000b10
+#define VMOVN		0xffb20200
 #define VMRS		0xeef1fa10
 #define VMUL_F32	0xee200a00
 #define VNEG_F32	0xeeb10a40
 #define VORR		0xef200110
 #define VPOP		0xecbd0b00
 #define VPUSH		0xed2d0b00
+#define VSHLL		0xef800a10
+#define VSHR		0xef800010
+#define VSRA		0xef800110
 #define VST1		0xf9000000
 #define VST1_s		0xf9800000
 #define VSTR_F32	0xed000a00
 #define VSUB_F32	0xee300a40
 
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+
+static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s32 fr, sljit_s32 is_32)
+{
+	if (compiler->scratches == -1)
+		return 0;
+
+	if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0))
+		fr -= SLJIT_FS0;
+
+	return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
+		|| (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+}
+
+#endif /* SLJIT_ARGUMENT_CHECKS */
+
 static sljit_s32 push_inst16(struct sljit_compiler *compiler, sljit_ins inst)
 {
 	sljit_u16 *ptr;
@@ -509,15 +541,16 @@
 {
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
+	case SLJIT_HAS_F64_AS_F32_PAIR:
 	case SLJIT_HAS_SIMD:
-	case SLJIT_SIMD_REGS_ARE_PAIRS:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #else
 		/* Available by default. */
 		return 1;
 #endif
 
+	case SLJIT_SIMD_REGS_ARE_PAIRS:
 	case SLJIT_HAS_CLZ:
 	case SLJIT_HAS_CTZ:
 	case SLJIT_HAS_REV:
@@ -526,6 +559,7 @@
 	case SLJIT_HAS_PREFETCH:
 	case SLJIT_HAS_COPY_F32:
 	case SLJIT_HAS_COPY_F64:
+	case SLJIT_HAS_ATOMIC:
 		return 1;
 
 	default:
@@ -682,9 +716,14 @@
 			break;
 		case SLJIT_ADDC:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_ADD;
-			imm = get_imm(imm);
-			if (imm != INVALID_IMM)
-				return push_inst32(compiler, ADCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, ADCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
+			if (flags & ARG2_IMM) {
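+				/* ADC with imm and SBC with ~imm both compute
+				   Rn + imm + carry, so the inverted immediate may
+				   be encodable when the original is not. */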
+				imm = get_imm(~imm);
+				if (imm != INVALID_IMM)
+					return push_inst32(compiler, SBCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
+			}
 			break;
 		case SLJIT_SUB:
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
@@ -737,9 +776,12 @@
 			compiler->status_flags_state = SLJIT_CURRENT_FLAGS_SUB;
 			if (flags & ARG1_IMM)
 				break;
-			imm = get_imm(imm);
+			imm2 = get_imm(imm);
+			if (imm2 != INVALID_IMM)
+				return push_inst32(compiler, SBCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm2);
+			imm = get_imm(~imm);
 			if (imm != INVALID_IMM)
-				return push_inst32(compiler, SBCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
+				return push_inst32(compiler, ADCI | (flags & SET_FLAGS) | RD4(dst) | RN4(reg) | imm);
 			break;
 		case SLJIT_AND:
 			imm2 = get_imm(imm);
@@ -818,8 +860,7 @@
 			imm = arg2;
 			arg2 = (arg1 == TMP_REG1) ? TMP_REG2 : TMP_REG1;
 			FAIL_IF(load_immediate(compiler, (sljit_s32)arg2, imm));
-		}
-		else {
+		} else {
 			imm = arg1;
 			arg1 = (arg2 == TMP_REG1) ? TMP_REG2 : TMP_REG1;
 			FAIL_IF(load_immediate(compiler, (sljit_s32)arg1, imm));
@@ -1754,22 +1795,22 @@
 			break;
 		case SLJIT_MOV_U8:
 			flags = BYTE_SIZE;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_u8)srcw;
 			break;
 		case SLJIT_MOV_S8:
 			flags = BYTE_SIZE | SIGNED;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_s8)srcw;
 			break;
 		case SLJIT_MOV_U16:
 			flags = HALF_SIZE;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_u16)srcw;
 			break;
 		case SLJIT_MOV_S16:
 			flags = HALF_SIZE | SIGNED;
-			if (src & SLJIT_IMM)
+			if (src == SLJIT_IMM)
 				srcw = (sljit_s16)srcw;
 			break;
 		default:
@@ -1778,7 +1819,7 @@
 			break;
 		}
 
-		if (src & SLJIT_IMM)
+		if (src == SLJIT_IMM)
 			FAIL_IF(emit_op_imm(compiler, SLJIT_MOV | ARG2_IMM, dst_r, TMP_REG2, (sljit_uw)srcw));
 		else if (src & SLJIT_MEM) {
 			FAIL_IF(emit_op_mem(compiler, flags, dst_r, src, srcw, TMP_REG1));
@@ -1831,7 +1872,7 @@
 	if (dst == TMP_REG1)
 		flags |= UNUSED_RETURN;
 
-	if (src1 & SLJIT_IMM)
+	if (src1 == SLJIT_IMM)
 		flags |= ARG1_IMM;
 	else if (src1 & SLJIT_MEM) {
 		emit_op_mem(compiler, WORD_SIZE, TMP_REG1, src1, src1w, TMP_REG1);
@@ -1840,7 +1881,7 @@
 	else
 		src1w = src1;
 
-	if (src2 & SLJIT_IMM)
+	if (src2 == SLJIT_IMM)
 		flags |= ARG2_IMM;
 	else if (src2 & SLJIT_MEM) {
 		src2_reg = (!(flags & ARG1_IMM) && (src1w == TMP_REG1)) ? TMP_REG2 : TMP_REG1;
@@ -1889,7 +1930,7 @@
 
 	ADJUST_LOCAL_OFFSET(src3, src3w);
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		src3w &= 0x1f;
 
 		if (src3w == 0)
@@ -1998,7 +2039,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return reg_map[reg];
 
 	if (type == SLJIT_FLOAT_REGISTER || type == SLJIT_SIMD_REG_64)
@@ -2259,8 +2300,10 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset32(struct sljit_compiler *compiler,
 	sljit_s32 freg, sljit_f32 value)
 {
+#if defined(__ARM_NEON) && __ARM_NEON
 	sljit_u32 exp;
 	sljit_ins ins;
+#endif /* NEON */
 	union {
 		sljit_u32 imm;
 		sljit_f32 value;
@@ -2271,6 +2314,7 @@
 
 	u.value = value;
 
+#if defined(__ARM_NEON) && __ARM_NEON
 	if ((u.imm << (32 - 19)) == 0) {
 		exp = (u.imm >> (23 + 2)) & 0x3f;
 
@@ -2279,6 +2323,7 @@
 			return push_inst32(compiler, (VMOV_F32 ^ (1 << 6)) | ((ins & 0xf0) << 12) | VD4(freg) | (ins & 0xf));
 		}
 	}
+#endif /* NEON */
 
 	FAIL_IF(load_immediate(compiler, TMP_REG1, u.imm));
 	return push_inst32(compiler, VMOV | VN4(freg) | RT4(TMP_REG1));
@@ -2287,8 +2332,10 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset64(struct sljit_compiler *compiler,
 	sljit_s32 freg, sljit_f64 value)
 {
+#if defined(__ARM_NEON) && __ARM_NEON
 	sljit_u32 exp;
 	sljit_ins ins;
+#endif /* NEON */
 	union {
 		sljit_u32 imm[2];
 		sljit_f64 value;
@@ -2299,6 +2346,7 @@
 
 	u.value = value;
 
+#if defined(__ARM_NEON) && __ARM_NEON
 	if (u.imm[0] == 0 && (u.imm[1] << (64 - 48)) == 0) {
 		exp = (u.imm[1] >> ((52 - 32) + 2)) & 0x1ff;
 
@@ -2307,6 +2355,7 @@
 			return push_inst32(compiler, (VMOV_F32 ^ (1 << 6)) | (1 << 8) | ((ins & 0xf0) << 12) | VD4(freg) | (ins & 0xf));
 		}
 	}
+#endif /* NEON */
 
 	FAIL_IF(load_immediate(compiler, TMP_REG1, u.imm[0]));
 	if (u.imm[0] == u.imm[1])
@@ -2729,7 +2778,7 @@
 
 	SLJIT_ASSERT(reg_map[TMP_REG1] != 14);
 
-	if (!(src & SLJIT_IMM)) {
+	if (src != SLJIT_IMM) {
 		if (FAST_IS_REG(src)) {
 			SLJIT_ASSERT(reg_map[src] != 14);
 			return push_inst16(compiler, (type <= SLJIT_JUMP ? BX : BLX) | RN3(src));
@@ -2929,7 +2978,7 @@
 
 	cc = get_cc(compiler, type & ~SLJIT_32);
 
-	if (!(src1 & SLJIT_IMM)) {
+	if (src1 != SLJIT_IMM) {
 		FAIL_IF(push_inst16(compiler, IT | (cc << 4) | 0x8));
 		return push_inst16(compiler, MOV | SET_REGS44(dst_reg, src1));
 	}
@@ -3018,7 +3067,7 @@
 				imm = get_imm((sljit_uw)(memw & ~0xfff));
 
 				if (imm != INVALID_IMM)
-					memw &= 0xff;
+					memw &= 0xfff;
 			}
 
 			if (imm == INVALID_IMM) {
@@ -3357,6 +3406,20 @@
 	return push_inst16(compiler, ADD | SET_REGS44(TMP_REG1, mem));
 }
 
+static SLJIT_INLINE sljit_s32 simd_get_quad_reg_index(sljit_s32 freg)
+{
+	freg += freg & 0x1;
+
+	SLJIT_ASSERT((freg_map[freg] & 0x1) == (freg <= SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS));
+
+	if (freg <= SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS)
+		freg--;
+
+	return freg;
+}
+
+#define SLJIT_QUAD_OTHER_HALF(freg) ((((freg) & 0x1) << 1) - 1)
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 freg,
 	sljit_s32 srcdst, sljit_sw srcdstw)
@@ -3381,9 +3444,12 @@
 		return SLJIT_SUCCESS;
 
 	if (reg_size == 4)
-		freg -= 1 - (freg & 0x1);
+		freg = simd_get_quad_reg_index(freg);
 
 	if (!(srcdst & SLJIT_MEM)) {
+		if (reg_size == 4)
+			srcdst = simd_get_quad_reg_index(srcdst);
+
 		if (type & SLJIT_SIMD_STORE)
 			ins = VD4(srcdst) | VN4(freg) | VM4(freg);
 		else
@@ -3530,9 +3596,9 @@
 		return SLJIT_SUCCESS;
 
 	if (reg_size == 4)
-		freg -= 1 - (freg & 0x1);
+		freg = simd_get_quad_reg_index(freg);
 
-	if ((src & SLJIT_IMM) && srcw == 0)
+	if (src == SLJIT_IMM && srcw == 0)
 		return push_inst32(compiler, VMOV_i | ((reg_size == 4) ? (1 << 6) : 0) | VD4(freg));
 
 	if (SLJIT_UNLIKELY(elem_size == 3)) {
@@ -3544,7 +3610,7 @@
 		} else if (freg != src)
 			FAIL_IF(push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src)));
 
-		freg++;
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
 
 		if (freg != src)
 			return push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src));
@@ -3563,15 +3629,16 @@
 	}
 
 	if (type & SLJIT_SIMD_FLOAT) {
-		ins = ((sljit_ins)1 << (16 + elem_size));
+		SLJIT_ASSERT(elem_size == 2);
+		ins = ((sljit_ins)freg_ebit_map[src] << (16 + 2 + 1)) | ((sljit_ins)1 << (16 + 2));
 
 		if (reg_size == 4)
 			ins |= 1 << 6;
 
-		return push_inst32(compiler, VDUP_s | ins | VD4(freg) | VM4(src));
+		return push_inst32(compiler, VDUP_s | ins | VD4(freg) | (sljit_ins)freg_map[src]);
 	}
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (elem_size < 2)
 			srcw &= ((sljit_sw)1 << (((sljit_sw)1 << elem_size) << 3)) - 1;
 
@@ -3628,13 +3695,37 @@
 	if (type & SLJIT_SIMD_TEST)
 		return SLJIT_SUCCESS;
 
-	if (reg_size == 4) {
-		freg -= 1 - (freg & 0x1);
+	if (reg_size == 4)
+		freg = simd_get_quad_reg_index(freg);
 
-		if (lane_index >= (0x8 >> elem_size)) {
-			lane_index -= (0x8 >> elem_size);
-			freg++;
+	if (type & SLJIT_SIMD_LANE_ZERO) {
+		ins = (reg_size == 3) ? 0 : ((sljit_ins)1 << 6);
+
+		if (type & SLJIT_SIMD_FLOAT) {
+			if (elem_size == 3 && !(srcdst & SLJIT_MEM)) {
+				if (lane_index == 1)
+					freg += SLJIT_QUAD_OTHER_HALF(freg);
+
+				if (srcdst != freg)
+					FAIL_IF(push_inst32(compiler, VORR | VD4(freg) | VN4(srcdst) | VM4(srcdst)));
+
+				freg += SLJIT_QUAD_OTHER_HALF(freg);
+				return push_inst32(compiler, VMOV_i | VD4(freg));
+			}
+
+			if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) {
+				FAIL_IF(push_inst32(compiler, VORR | ins | VD4(TMP_FREG1) | VN4(freg) | VM4(freg)));
+				srcdst = TMP_FREG1;
+				srcdstw = 0;
+			}
 		}
+
+		FAIL_IF(push_inst32(compiler, VMOV_i | ins | VD4(freg)));
+	}
+
+	if (reg_size == 4 && lane_index >= (0x8 >> elem_size)) {
+		lane_index -= (0x8 >> elem_size);
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
 	}
 
 	if (srcdst & SLJIT_MEM) {
@@ -3656,16 +3747,22 @@
 		}
 
 		if (type & SLJIT_SIMD_STORE) {
-			if (lane_index == 0)
-				return push_inst32(compiler, VMOV_F32 | SLJIT_32 | VD4(srcdst) | VM4(freg));
-			return push_inst32(compiler, VDUP_s | 0xc0000 | VD4(srcdst) | VM4(freg));
+			if (freg_ebit_map[freg] == 0) {
+				if (lane_index == 1)
+					freg = SLJIT_F64_SECOND(freg);
+
+				return push_inst32(compiler, VMOV_F32 | VD4(srcdst) | VM4(freg));
+			}
+
+			FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | ((sljit_ins)lane_index << 21) | VN4(freg) | RT4(TMP_REG1)));
+			return push_inst32(compiler, VMOV | VN4(srcdst) | RT4(TMP_REG1));
 		}
 
 		FAIL_IF(push_inst32(compiler, VMOV | (1 << 20) | VN4(srcdst) | RT4(TMP_REG1)));
 		return push_inst32(compiler, VMOV_s | ((sljit_ins)lane_index << 21) | VN4(freg) | RT4(TMP_REG1));
 	}
 
-	if (srcdst & SLJIT_IMM) {
+	if (srcdst == SLJIT_IMM) {
 		if (elem_size < 2)
 			srcdstw &= ((sljit_sw)1 << (((sljit_sw)1 << elem_size) << 3)) - 1;
 
@@ -3684,10 +3781,10 @@
 	ins |= (sljit_ins)(((lane_index & 0x4) << 19) | ((lane_index & 0x3) << 5));
 
 	if (type & SLJIT_SIMD_STORE) {
-		ins |= 0x100000;
+		ins |= (1 << 20);
 
-		if (elem_size < 2)
-			ins |= 0x800000;
+		if (elem_size < 2 && !(type & SLJIT_SIMD_LANE_SIGNED))
+			ins |= (1 << 23);
 	}
 
 	return push_inst32(compiler, VMOV_s | ins | VN4(freg) | RT4(srcdst));
@@ -3714,12 +3811,12 @@
 		return SLJIT_SUCCESS;
 
 	if (reg_size == 4) {
-		freg -= 1 - (freg & 0x1);
-		src -= 1 - (src & 0x1);
+		freg = simd_get_quad_reg_index(freg);
+		src = simd_get_quad_reg_index(src);
 
 		if (src_lane_index >= (0x8 >> elem_size)) {
 			src_lane_index -= (0x8 >> elem_size);
-			src++;
+			src += SLJIT_QUAD_OTHER_HALF(src);
 		}
 	}
 
@@ -3727,7 +3824,7 @@
 		if (freg != src)
 			FAIL_IF(push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src)));
 
-		freg++;
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
 
 		if (freg != src)
 			return push_inst32(compiler, VORR | VD4(freg) | VN4(src) | VM4(src));
@@ -3742,26 +3839,168 @@
 	return push_inst32(compiler, VDUP_s | ins | VD4(freg) | VM4(src));
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+	sljit_s32 dst_reg;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
+
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size != 2 || elem2_size != 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (reg_size == 4)
+		freg = simd_get_quad_reg_index(freg);
+
+	if (src & SLJIT_MEM) {
+		FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src, srcw));
+		if (reg_size == 4 && elem2_size - elem_size == 1)
+			FAIL_IF(push_inst32(compiler, VLD1 | (0x7 << 8) | VD4(freg) | RN4(src) | 0xf));
+		else
+			FAIL_IF(push_inst32(compiler, VLD1_s | (sljit_ins)((reg_size - elem2_size + elem_size) << 10) | VD4(freg) | RN4(src) | 0xf));
+		src = freg;
+	} else if (reg_size == 4)
+		src = simd_get_quad_reg_index(src);
+
+	if (!(type & SLJIT_SIMD_FLOAT)) {
+		dst_reg = (reg_size == 4) ? freg : TMP_FREG1;
+
+		do {
+			FAIL_IF(push_inst32(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0 : (1 << 28))
+				| ((sljit_ins)1 << (19 + elem_size)) | VD4(dst_reg) | VM4(src)));
+			src = dst_reg;
+		} while (++elem_size < elem2_size);
+
+		if (dst_reg == TMP_FREG1)
+			return push_inst32(compiler, VORR | VD4(freg) | VN4(TMP_FREG1) | VM4(TMP_FREG1));
+		return SLJIT_SUCCESS;
+	}
+
+	/* No SIMD variant, must use VFP instead. */
+	SLJIT_ASSERT(reg_size == 4);
+
+	if (freg == src) {
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
+		FAIL_IF(push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src) | 0x20));
+		freg += SLJIT_QUAD_OTHER_HALF(freg);
+		return push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src));
+	}
+
+	FAIL_IF(push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src)));
+	freg += SLJIT_QUAD_OTHER_HALF(freg);
+	return push_inst32(compiler, VCVT_F64_F32 | VD4(freg) | VM4(src) | 0x20);
+}
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins ins, imms;
+	sljit_s32 dst_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
+
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	switch (elem_size) {
+	case 0:
+		imms = 0x243219;
+		ins = VSHR | (1 << 28) | (0x9 << 16);
+		break;
+	case 1:
+		imms = (reg_size == 4) ? 0x243219 : 0x2231;
+		ins = VSHR | (1 << 28) | (0x11 << 16);
+		break;
+	case 2:
+		imms = (reg_size == 4) ? 0x2231 : 0x21;
+		ins = VSHR | (1 << 28) | (0x21 << 16);
+		break;
+	default:
+		imms = 0x21;
+		ins = VSHR | (1 << 28) | (0x1 << 16) | (1 << 7);
+		break;
+	}
+
+	if (reg_size == 4) {
+		freg = simd_get_quad_reg_index(freg);
+		ins |= (1 << 6);
+	}
+
+	SLJIT_ASSERT((freg_map[TMP_FREG1] & 0x1) == 0);
+	FAIL_IF(push_inst32(compiler, ins | VD4(TMP_FREG1) | VM4(freg)));
+
+	if (reg_size == 4 && elem_size > 0)
+		FAIL_IF(push_inst32(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD4(TMP_FREG1) | VM4(TMP_FREG1)));
+
+	ins = (reg_size == 4 && elem_size == 0) ? (1 << 6) : 0;
+
+	while (imms >= 0x100) {
+		FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | ((imms & 0xff) << 16) | VD4(TMP_FREG1) | VM4(TMP_FREG1)));
+		imms >>= 8;
+	}
+
+	FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | (1 << 7) | (imms << 16) | VD4(TMP_FREG1) | VM4(TMP_FREG1)));
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+	FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(dst_r) | VN4(TMP_FREG1)));
+
+	if (reg_size == 4 && elem_size == 0) {
+		SLJIT_ASSERT(freg_map[TMP_FREG1] + 1 == freg_map[TMP_FREG2]);
+		FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(TMP_REG2) | VN4(TMP_FREG2)));
+		FAIL_IF(push_inst32(compiler, ORR_W | RD4(dst_r) | RN4(dst_r) | RM4(TMP_REG2) | (0x2 << 12)));
+	}
+
+	if (dst_r == TMP_REG1)
+		return emit_op_mem(compiler, STORE | WORD_SIZE, TMP_REG1, dst, dstw, TMP_REG2);
+
+	return SLJIT_SUCCESS;
+}
+
 #undef FPU_LOAD
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst_reg,
 	sljit_s32 mem_reg)
 {
-	sljit_ins ins = 0;
+	sljit_ins ins;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));
 
 	switch (GET_OPCODE(op)) {
 	case SLJIT_MOV_U8:
-		ins = LDAEXB;
+		ins = LDREXB;
 		break;
 	case SLJIT_MOV_U16:
-		ins = LDAEXH;
+		ins = LDREXH;
 		break;
 	default:
-		ins = LDAEX;
+		ins = LDREX;
 		break;
 	}
 
@@ -3773,25 +4012,31 @@
 	sljit_s32 mem_reg,
 	sljit_s32 temp_reg)
 {
-	sljit_ins ins = 0;
+	sljit_ins ins;
+
+	/* The case temp_reg == mem_reg is undefined, so another temp register is used. */
+	SLJIT_UNUSED_ARG(temp_reg);
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
 
 	switch (GET_OPCODE(op)) {
 	case SLJIT_MOV_U8:
-		ins = STREXB;
+		ins = STREXB | RM4(TMP_REG1);
 		break;
 	case SLJIT_MOV_U16:
-		ins = STREXH;
+		ins = STREXH | RM4(TMP_REG1);
 		break;
 	default:
-		ins = STREX;
+		ins = STREX | RD4(TMP_REG1);
 		break;
 	}
 
-	FAIL_IF(push_inst32(compiler, ins | RN4(mem_reg) | RT4(src_reg)| RM4(TMP_REG1)));
-	return push_inst32(compiler, CMPI_W | RN4(TMP_REG1) | 0);
+	FAIL_IF(push_inst32(compiler, ins | RN4(mem_reg) | RT4(src_reg)));
+	if (op & SLJIT_SET_ATOMIC_STORED)
+		return push_inst32(compiler, CMPI_W | RN4(TMP_REG1));
+
+	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
diff --git a/src/sljit/sljitNativeLOONGARCH_64.c b/src/sljit/sljitNativeLOONGARCH_64.c
index 79e8202..fe6fad4 100644
--- a/src/sljit/sljitNativeLOONGARCH_64.c
+++ b/src/sljit/sljitNativeLOONGARCH_64.c
@@ -46,7 +46,6 @@
 #define TMP_FREG1	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
 #define TMP_FREG2	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
 
-
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = {
 	0, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19, 20, 22, 31, 30, 29, 28, 27, 26, 25, 24, 23, 3, 13, 1, 14, 12, 15
 };
@@ -245,6 +244,12 @@
 #define LL_D OPC_2RI14(0x22)
 #define SC_D OPC_2RI14(0x23)
 
+/* LoongArch V1.10 Instructions */
+#define AMCAS_B OPC_3R(0x70B0)
+#define AMCAS_H OPC_3R(0x70B1)
+#define AMCAS_W OPC_3R(0x70B2)
+#define AMCAS_D OPC_3R(0x70B3)
+
 /* Other instructions */
 #define BREAK OPC_3R(0x54)
 #define DBGCALL OPC_3R(0x55)
@@ -332,6 +337,19 @@
 
 #define INST(inst, type) ((sljit_ins)((type & SLJIT_32) ? inst##_W : inst##_D))
 
+/* LoongArch CPUCFG register for feature detection */
+#define LOONGARCH_CFG2				0x02
+#define LOONGARCH_FEATURE_LAMCAS	(1 << 28)
+
+static sljit_u32 cpu_feature_list = 0;
+
+static SLJIT_INLINE sljit_u32 get_cpu_features(void)
+{
+	if (cpu_feature_list == 0)
+		__asm__ ("cpucfg %0, %1" : "+&r"(cpu_feature_list) : "r"(LOONGARCH_CFG2));
+	return cpu_feature_list;
+}
+
 static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins)
 {
 	sljit_ins *ptr = (sljit_ins*)ensure_buf(compiler, sizeof(sljit_ins));
@@ -629,12 +647,15 @@
 	{
 	case SLJIT_HAS_FPU:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #else
 		/* Available by default. */
 		return 1;
 #endif
 
+	case SLJIT_HAS_ATOMIC:
+		return (LOONGARCH_FEATURE_LAMCAS & get_cpu_features());
+
 	case SLJIT_HAS_CLZ:
 	case SLJIT_HAS_CTZ:
 	case SLJIT_HAS_REV:
@@ -1603,11 +1624,11 @@
 		flags |= SLOW_DEST;
 
 	if (flags & IMM_OP) {
-		if ((src2 & SLJIT_IMM) && src2w != 0 && src2w <= I12_MAX && src2w >= I12_MIN) {
+		if (src2 == SLJIT_IMM && src2w != 0 && src2w <= I12_MAX && src2w >= I12_MIN) {
 			flags |= SRC2_IMM;
 			src2_r = src2w;
 		}
-		else if ((flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w != 0 && src1w <= I12_MAX && src1w >= I12_MIN) {
+		else if ((flags & CUMULATIVE_OP) && src1 == SLJIT_IMM && src1w != 0 && src1w <= I12_MAX && src1w >= I12_MIN) {
 			flags |= SRC2_IMM;
 			src2_r = src1w;
 
@@ -1624,7 +1645,7 @@
 		src1_r = src1;
 		flags |= REG1_SOURCE;
 	}
-	else if (src1 & SLJIT_IMM) {
+	else if (src1 == SLJIT_IMM) {
 		if (src1w) {
 			FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
 			src1_r = TMP_REG1;
@@ -1647,7 +1668,7 @@
 		if ((flags & (REG_DEST | MOVE_OP)) == MOVE_OP)
 			dst_r = (sljit_s32)src2_r;
 	}
-	else if (src2 & SLJIT_IMM) {
+	else if (src2 == SLJIT_IMM) {
 		if (!(flags & SRC2_IMM)) {
 			if (src2w) {
 				FAIL_IF(load_immediate(compiler, sugg_src2_r, src2w));
@@ -1761,24 +1782,24 @@
 		return emit_op(compiler, SLJIT_MOV, WORD_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, srcw);
 
 	case SLJIT_MOV_U32:
-		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u32)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u32)srcw : srcw);
 
 	case SLJIT_MOV_S32:
 	/* Logical operators have no W variant, so sign extended input is necessary for them. */
 	case SLJIT_MOV32:
-		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s32)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s32)srcw : srcw);
 
 	case SLJIT_MOV_U8:
-		return emit_op(compiler, op, BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
+		return emit_op(compiler, op, BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u8)srcw : srcw);
 
 	case SLJIT_MOV_S8:
-		return emit_op(compiler, op, BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
+		return emit_op(compiler, op, BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s8)srcw : srcw);
 
 	case SLJIT_MOV_U16:
-		return emit_op(compiler, op, HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
+		return emit_op(compiler, op, HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u16)srcw : srcw);
 
 	case SLJIT_MOV_S16:
-		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
+		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s16)srcw : srcw);
 
 	case SLJIT_CLZ:
 	case SLJIT_CTZ:
@@ -1813,9 +1834,9 @@
 
 	if (op & SLJIT_32) {
 		flags |= INT_DATA | SIGNED_DATA;
-		if (src1 & SLJIT_IMM)
+		if (src1 == SLJIT_IMM)
 			src1w = (sljit_s32)src1w;
-		if (src2 & SLJIT_IMM)
+		if (src2 == SLJIT_IMM)
 			src2w = (sljit_s32)src2w;
 	}
 
@@ -1848,7 +1869,7 @@
 	case SLJIT_MASHR:
 	case SLJIT_ROTL:
 	case SLJIT_ROTR:
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			if (op & SLJIT_32)
 				src2w &= 0x1f;
 			else
@@ -1897,7 +1918,7 @@
 
 	ADJUST_LOCAL_OFFSET(src3, src3w);
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		src3w &= bit_length - 1;
 
 		if (src3w == 0)
@@ -2022,7 +2043,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return reg_map[reg];
 
 	if (type != SLJIT_FLOAT_REGISTER)
@@ -2126,7 +2147,7 @@
 	if (src & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem2(compiler, (word_data ? WORD_DATA : INT_DATA) | LOAD_DATA, TMP_REG1, src, srcw, dst, dstw));
 		src = TMP_REG1;
-	} else if (src & SLJIT_IMM) {
+	} else if (src == SLJIT_IMM) {
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 			srcw = (sljit_s32)srcw;
 
@@ -2173,7 +2194,7 @@
 	if (src & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem2(compiler, (word_data ? WORD_DATA : INT_DATA) | LOAD_DATA, TMP_REG1, src, srcw, dst, dstw));
 		src = TMP_REG1;
-	} else if (src & SLJIT_IMM) {
+	} else if (src == SLJIT_IMM) {
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_U32)
 			srcw = (sljit_u32)srcw;
 
@@ -2363,8 +2384,6 @@
 	case SLJIT_DIV_F64:
 		FAIL_IF(push_inst(compiler, FINST(FDIV, op) | FRD(dst_r) | FRJ(src1) | FRK(src2)));
 		break;
-	case SLJIT_COPYSIGN_F64:
-		return push_inst(compiler, FINST(FCOPYSIGN, op) | FRD(dst_r) | FRJ(src1) | FRK(src2));
 	}
 
 	if (dst_r == TMP_FREG2)
@@ -2372,6 +2391,32 @@
 	return SLJIT_SUCCESS;
 }
 
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op,
+	sljit_s32 dst,
+	sljit_s32 src1, sljit_sw src1w,
+	sljit_s32 src2, sljit_sw src2w)
+{
+	sljit_s32 reg;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_fop2r(compiler, op, dst, src1, src1w, src2, src2w));
+	ADJUST_LOCAL_OFFSET(src1, src1w);
+	ADJUST_LOCAL_OFFSET(src2, src2w);
+
+	if (src2 & SLJIT_MEM) {
+		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, TMP_FREG1, src2, src2w, 0, 0));
+		src2 = TMP_FREG1;
+	}
+
+	if (src1 & SLJIT_MEM) {
+		reg = (dst == src2) ? TMP_FREG1 : dst;
+		FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, reg, src1, src1w, 0, 0));
+		src1 = reg;
+	}
+
+	return push_inst(compiler, FINST(FCOPYSIGN, op) | FRD(dst) | FRJ(src1) | FRK(src2));
+}
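+
+/* Usage sketch (operands hypothetical): sljit_emit_fop2r(compiler,
+   SLJIT_COPYSIGN_F64, SLJIT_FR0, SLJIT_FR1, 0, SLJIT_FR2, 0) computes
+   FR0 = copysign(FR1, FR2) with a single FCOPYSIGN instruction. */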
+
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset32(struct sljit_compiler *compiler,
 	sljit_s32 freg, sljit_f32 value)
 {
@@ -2451,8 +2496,10 @@
 {
 	switch (type) {
 	case SLJIT_EQUAL:
+	case SLJIT_ATOMIC_NOT_STORED:
 		return BNE | RJ(EQUAL_FLAG) | RD(TMP_ZERO);
 	case SLJIT_NOT_EQUAL:
+	case SLJIT_ATOMIC_STORED:
 		return BEQ | RJ(EQUAL_FLAG) | RD(TMP_ZERO);
 	case SLJIT_LESS:
 	case SLJIT_GREATER:
@@ -2576,7 +2623,7 @@
 		src2 = TMP_REG2;
 	}
 
-	if (src1 & SLJIT_IMM) {
+	if (src1 == SLJIT_IMM) {
 		if (src1w != 0) {
 			PTR_FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
 			src1 = TMP_REG1;
@@ -2585,7 +2632,7 @@
 			src1 = TMP_ZERO;
 	}
 
-	if (src2 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
 		if (src2w != 0) {
 			PTR_FAIL_IF(load_immediate(compiler, TMP_REG2, src2w));
 			src2 = TMP_REG2;
@@ -2653,7 +2700,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
 
-	if (!(src & SLJIT_IMM)) {
+	if (src != SLJIT_IMM) {
 		if (src & SLJIT_MEM) {
 			ADJUST_LOCAL_OFFSET(src, srcw);
 			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
@@ -2736,6 +2783,12 @@
 			FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RJ(EQUAL_FLAG) | IMM_I12(1)));
 			src_r = dst_r;
 			break;
+		case SLJIT_ATOMIC_STORED:
+		case SLJIT_ATOMIC_NOT_STORED:
+			FAIL_IF(push_inst(compiler, SLTUI | RD(dst_r) | RJ(EQUAL_FLAG) | IMM_I12(1)));
+			src_r = dst_r;
+			invert ^= 0x1;
+			break;
 		case SLJIT_OVERFLOW:
 		case SLJIT_NOT_OVERFLOW:
 			if (compiler->status_flags_state & (SLJIT_CURRENT_FLAGS_ADD | SLJIT_CURRENT_FLAGS_SUB)) {
@@ -2829,7 +2882,7 @@
 
 	if (src1 & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem(compiler, inp_flags, dst_reg, src1, src1w));
-	} else if (src1 & SLJIT_IMM) {
+	} else if (src1 == SLJIT_IMM) {
 		if (type & SLJIT_32)
 			src1w = (sljit_s32)src1w;
 		FAIL_IF(load_immediate(compiler, dst_reg, src1w));
@@ -2935,15 +2988,33 @@
 	sljit_s32 dst_reg,
 	sljit_s32 mem_reg)
 {
-	SLJIT_UNUSED_ARG(compiler);
-	SLJIT_UNUSED_ARG(op);
-	SLJIT_UNUSED_ARG(dst_reg);
-	SLJIT_UNUSED_ARG(mem_reg);
+	sljit_ins ins;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_atomic_load(compiler, op, dst_reg, mem_reg));
 
-	return SLJIT_ERR_UNSUPPORTED;
+	if (!(LOONGARCH_FEATURE_LAMCAS & get_cpu_features()))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_MOV_U8:
+		ins = LD_BU;
+		break;
+	case SLJIT_MOV_U16:
+		ins = LD_HU;
+		break;
+	case SLJIT_MOV32:
+		ins = LD_W;
+		break;
+	case SLJIT_MOV_U32:
+		ins = LD_WU;
+		break;
+	default:
+		ins = LD_D;
+		break;
+	}
+
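+	/* A plain load suffices here: no LL/SC-style reservation is needed,
+	   because sljit_emit_atomic_store re-checks the loaded value with AMCAS. */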
+	return push_inst(compiler, ins | RD(dst_reg) | RJ(mem_reg) | IMM_I12(0));
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler *compiler,
@@ -2952,16 +3023,50 @@
 	sljit_s32 mem_reg,
 	sljit_s32 temp_reg)
 {
-	SLJIT_UNUSED_ARG(compiler);
-	SLJIT_UNUSED_ARG(op);
-	SLJIT_UNUSED_ARG(src_reg);
-	SLJIT_UNUSED_ARG(mem_reg);
-	SLJIT_UNUSED_ARG(temp_reg);
+	sljit_ins ins = 0;
+	sljit_ins unsign = 0;
+	sljit_s32 tmp = temp_reg;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_atomic_store(compiler, op, src_reg, mem_reg, temp_reg));
 
-	return SLJIT_ERR_UNSUPPORTED;
+	if (!(LOONGARCH_FEATURE_LAMCAS & get_cpu_features()))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	switch (GET_OPCODE(op)) {
+	case SLJIT_MOV_U8:
+		ins = AMCAS_B;
+		unsign = BSTRPICK_D | (7 << 16);
+		break;
+	case SLJIT_MOV_U16:
+		ins = AMCAS_H;
+		unsign = BSTRPICK_D | (15 << 16);
+		break;
+	case SLJIT_MOV32:
+		ins = AMCAS_W;
+		break;
+	case SLJIT_MOV_U32:
+		ins = AMCAS_W;
+		unsign = BSTRPICK_D | (31 << 16);
+		break;
+	default:
+		ins = AMCAS_D;
+		break;
+	}
+
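+	/* Sketch of the flag computation below: the expected value is copied to
+	   TMP_REG1 (XOR with the zero register acts as a move) so that AMCAS
+	   clobbers the copy and temp_reg survives; the old memory value is then
+	   zero-extended for sub-word sizes and compared with temp_reg, and SLTUI
+	   turns "old == expected" into EQUAL_FLAG = 1. */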
+	if (op & SLJIT_SET_ATOMIC_STORED) {
+		FAIL_IF(push_inst(compiler, XOR | RD(TMP_REG1) | RJ(temp_reg) | RK(TMP_ZERO)));
+		tmp = TMP_REG1;
+	}
+	FAIL_IF(push_inst(compiler, ins | RD(tmp) | RJ(mem_reg) | RK(src_reg)));
+	if (!(op & SLJIT_SET_ATOMIC_STORED))
+		return SLJIT_SUCCESS;
+
+	if (unsign)
+		FAIL_IF(push_inst(compiler, unsign | RD(tmp) | RJ(tmp)));
+
+	FAIL_IF(push_inst(compiler, XOR | RD(EQUAL_FLAG) | RJ(tmp) | RK(temp_reg)));
+	return push_inst(compiler, SLTUI | RD(EQUAL_FLAG) | RJ(EQUAL_FLAG) | IMM_I12(1));
 }
 
 static SLJIT_INLINE sljit_s32 emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw init_value, sljit_ins last_ins)
diff --git a/src/sljit/sljitNativeMIPS_32.c b/src/sljit/sljitNativeMIPS_32.c
index f22346b..9620b94 100644
--- a/src/sljit/sljitNativeMIPS_32.c
+++ b/src/sljit/sljitNativeMIPS_32.c
@@ -26,6 +26,49 @@
 
 /* mips 32-bit arch dependent functions. */
 
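+/* copysign(x, y) without a native instruction (sketch): the sign-carrying
+   words of x and y are moved to GPRs (the high words for doubles), XOR
+   exposes the sign difference, SRL/SLL by 31 isolate that bit, and a final
+   XOR flips x's sign before the word is written back to the FPU. */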
+static sljit_s32 emit_copysign(struct sljit_compiler *compiler, sljit_s32 op,
+		sljit_sw src1, sljit_sw src2, sljit_sw dst)
+{
+	int is_32 = (op & SLJIT_32);
+	sljit_ins mfhc = MFC1, mthc = MTC1;
+	sljit_ins src1_r = FS(src1), src2_r = FS(src2), dst_r = FS(dst);
+
+	if (!is_32) {
+		switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+		case CPU_FEATURE_FR:
+			mfhc = MFHC1;
+			mthc = MTHC1;
+			break;
+#endif /* SLJIT_MIPS_REV >= 2 */
+		default:
+			src1_r |= (1 << 11);
+			src2_r |= (1 << 11);
+			dst_r |= (1 << 11);
+			break;
+		}
+	}
+
+	FAIL_IF(push_inst(compiler, mfhc | T(TMP_REG1) | src1_r, DR(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, mfhc | T(TMP_REG2) | src2_r, DR(TMP_REG2)));
+	if (!is_32 && src1 != dst)
+		FAIL_IF(push_inst(compiler, MOV_fmt(FMT_S) | FS(src1) | FD(dst), MOVABLE_INS));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	else
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif /* MIPS III */
+	FAIL_IF(push_inst(compiler, XOR | T(TMP_REG1) | D(TMP_REG2) | S(TMP_REG2), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, SRL | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(31), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, SLL | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(31), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, XOR | T(TMP_REG2) | D(TMP_REG1) | S(TMP_REG1), DR(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, mthc | T(TMP_REG1) | dst_r, MOVABLE_INS));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	if (mthc == MTC1)
+		return push_inst(compiler, NOP, UNMOVABLE_INS);
+#endif /* MIPS III */
+	return SLJIT_SUCCESS;
+}
+
 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_ar, sljit_sw imm)
 {
 	if (!(imm & ~0xffff))
@@ -48,7 +91,15 @@
 	sljit_s32 freg, sljit_f64 value)
 {
 	union {
-		sljit_s32 imm[2];
+		struct {
+#if defined(SLJIT_LITTLE_ENDIAN) && SLJIT_LITTLE_ENDIAN
+			sljit_s32 lo;
+			sljit_s32 hi;
+#else /* !SLJIT_LITTLE_ENDIAN */
+			sljit_s32 hi;
+			sljit_s32 lo;
+#endif /* SLJIT_LITTLE_ENDIAN */
+		} bin;
 		sljit_f64 value;
 	} u;
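+	/* The union reinterprets the f64 bit pattern as two 32-bit words; naming
+	   them lo/hi according to endianness keeps the MTC1/MTHC1 pair below
+	   correct on both byte orders. */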
 
@@ -57,42 +108,85 @@
 
 	u.value = value;
 
-	if (u.imm[0] != 0)
-		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), u.imm[0]));
-	if (u.imm[1] != 0)
-		FAIL_IF(load_immediate(compiler, DR(TMP_REG2), u.imm[1]));
+	if (u.bin.lo != 0)
+		FAIL_IF(load_immediate(compiler, DR(TMP_REG1), u.bin.lo));
+	if (u.bin.hi != 0)
+		FAIL_IF(load_immediate(compiler, DR(TMP_REG2), u.bin.hi));
 
-	FAIL_IF(push_inst(compiler, MTC1 | (u.imm[0] != 0 ? T(TMP_REG1) : TA(0)) | FS(freg), MOVABLE_INS));
-	return push_inst(compiler, MTC1 | (u.imm[1] != 0 ? T(TMP_REG2) : TA(0)) | FS(freg) | (1 << 11), MOVABLE_INS);
+	FAIL_IF(push_inst(compiler, MTC1 | (u.bin.lo != 0 ? T(TMP_REG1) : TA(0)) | FS(freg), MOVABLE_INS));
+	switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+	case CPU_FEATURE_FR:
+		return push_inst(compiler, MTHC1 | (u.bin.hi != 0 ? T(TMP_REG2) : TA(0)) | FS(freg), MOVABLE_INS);
+#endif /* SLJIT_MIPS_REV >= 2 */
+	default:
+		FAIL_IF(push_inst(compiler, MTC1 | (u.bin.hi != 0 ? T(TMP_REG2) : TA(0)) | FS(freg) | (1 << 11), MOVABLE_INS));
+		break;
+	}
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif /* MIPS III */
+	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 freg, sljit_s32 reg)
 {
-	sljit_s32 reg2;
-	sljit_ins inst;
+	sljit_s32 reg2 = 0;
+	sljit_ins inst = FS(freg);
+	sljit_ins mthc = MTC1, mfhc = MFC1;
+	int is_32 = (op & SLJIT_32);
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_fcopy(compiler, op, freg, reg));
 
+	op = GET_OPCODE(op);
 	if (reg & REG_PAIR_MASK) {
 		reg2 = REG_PAIR_SECOND(reg);
 		reg = REG_PAIR_FIRST(reg);
 
-		inst = T(reg2) | FS(freg) | (1 << 11);
+		inst |= T(reg2);
 
 		if (op == SLJIT_COPY_TO_F64)
 			FAIL_IF(push_inst(compiler, MTC1 | inst, MOVABLE_INS));
 		else
 			FAIL_IF(push_inst(compiler, MFC1 | inst, DR(reg2)));
+
+		inst = FS(freg) | (1 << 11);
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+		if (cpu_feature_list & CPU_FEATURE_FR) {
+			mthc = MTHC1;
+			mfhc = MFHC1;
+			inst = FS(freg);
+		}
+#endif /* SLJIT_MIPS_REV >= 2 */
 	}
 
-	inst = T(reg) | FS(freg);
+	inst |= T(reg);
+	if (!is_32 && !reg2) {
+		switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+		case CPU_FEATURE_FR:
+			mthc = MTHC1;
+			mfhc = MFHC1;
+			break;
+#endif /* SLJIT_MIPS_REV >= 2 */
+		default:
+			inst |= (1 << 11);
+			break;
+		}
+	}
 
-	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64)
-		return push_inst(compiler, MTC1 | inst, MOVABLE_INS);
+	if (op == SLJIT_COPY_TO_F64)
+		FAIL_IF(push_inst(compiler, mthc | inst, MOVABLE_INS));
+	else
+		FAIL_IF(push_inst(compiler, mfhc | inst, DR(reg)));
 
-	return push_inst(compiler, MFC1 | inst, DR(reg));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	if (mthc == MTC1 || mfhc == MFC1)
+		return push_inst(compiler, NOP, UNMOVABLE_INS);
+#endif /* MIPS III */
+	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
@@ -125,6 +219,11 @@
 	sljit_ins ins = NOP;
 	sljit_u8 offsets[4];
 	sljit_u8 *offsets_ptr = offsets;
+#if defined(SLJIT_LITTLE_ENDIAN) && SLJIT_LITTLE_ENDIAN
+	sljit_ins f64_hi = TA(7), f64_lo = TA(6);
+#else
+	sljit_ins f64_hi = TA(6), f64_lo = TA(7);
+#endif /* SLJIT_LITTLE_ENDIAN */
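+	/* Sketch: in the o32 calling convention a double passed in registers
+	   occupies $6/$7 (a2/a3); which of the two holds the high word is
+	   endian-dependent, hence the f64_hi/f64_lo selection above. */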
 
 	SLJIT_ASSERT(reg_map[TMP_REG1] == 4 && freg_map[TMP_FREG1] == 12);
 
@@ -189,20 +288,28 @@
 
 		switch (types & SLJIT_ARG_MASK) {
 		case SLJIT_ARG_TYPE_F64:
-			if (*offsets_ptr < 4 * sizeof (sljit_sw)) {
+			if (*offsets_ptr < 4 * sizeof(sljit_sw)) {
 				if (prev_ins != NOP)
 					FAIL_IF(push_inst(compiler, prev_ins, MOVABLE_INS));
 
 				/* Must be preceded by at least one other argument,
 				 * and its starting offset must be 8 because of alignment. */
 				SLJIT_ASSERT((*offsets_ptr >> 2) == 2);
-
-				prev_ins = MFC1 | TA(6) | FS(float_arg_count) | (1 << 11);
-				ins = MFC1 | TA(7) | FS(float_arg_count);
+				switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+				case CPU_FEATURE_FR:
+					prev_ins = MFHC1 | f64_hi | FS(float_arg_count);
+					break;
+#endif /* SLJIT_MIPS_REV >= 2 */
+				default:
+					prev_ins = MFC1 | f64_hi | FS(float_arg_count) | (1 << 11);
+					break;
+				}
+				ins = MFC1 | f64_lo | FS(float_arg_count);
 			} else if (*offsets_ptr < 254)
 				ins = SDC1 | S(SLJIT_SP) | FT(float_arg_count) | IMM(*offsets_ptr);
 			else if (*offsets_ptr == 254)
-				ins = MOV_S | FMT_D | FS(SLJIT_FR0) | FD(TMP_FREG1);
+				ins = MOV_fmt(FMT_D) | FS(SLJIT_FR0) | FD(TMP_FREG1);
 
 			float_arg_count--;
 			break;
@@ -212,7 +319,7 @@
 			else if (*offsets_ptr < 254)
 				ins = SWC1 | S(SLJIT_SP) | FT(float_arg_count) | IMM(*offsets_ptr);
 			else if (*offsets_ptr == 254)
-				ins = MOV_S | FMT_S | FS(SLJIT_FR0) | FD(TMP_FREG1);
+				ins = MOV_fmt(FMT_S) | FS(SLJIT_FR0) | FD(TMP_FREG1);
 
 			float_arg_count--;
 			break;
@@ -336,7 +443,7 @@
 
 	SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
 
-	if (src & SLJIT_IMM)
+	if (src == SLJIT_IMM)
 		FAIL_IF(load_immediate(compiler, DR(PIC_ADDR_REG), srcw));
 	else if (src != PIC_ADDR_REG)
 		FAIL_IF(push_inst(compiler, ADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
diff --git a/src/sljit/sljitNativeMIPS_64.c b/src/sljit/sljitNativeMIPS_64.c
index a05e1a8..52a0d3f 100644
--- a/src/sljit/sljitNativeMIPS_64.c
+++ b/src/sljit/sljitNativeMIPS_64.c
@@ -26,6 +26,23 @@
 
 /* mips 64-bit arch dependent functions. */
 
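+/* Same sign-transfer idea as the 32-bit emit_copysign, but each operand fits
+   in one GPR via DMFC1/DMTC1; SELECT_OP picks the 32-bit forms for f32. */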
+static sljit_s32 emit_copysign(struct sljit_compiler *compiler, sljit_s32 op,
+		sljit_s32 src1, sljit_s32 src2, sljit_s32 dst)
+{
+	FAIL_IF(push_inst(compiler, SELECT_OP(DMFC1, MFC1) | T(TMP_REG1) | FS(src1), DR(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, SELECT_OP(DMFC1, MFC1) | T(TMP_REG2) | FS(src2), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, XOR | S(TMP_REG2) | T(TMP_REG1) | D(TMP_REG2), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, SELECT_OP(DSRL32, SRL) | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(31), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, SELECT_OP(DSLL32, SLL) | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(31), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, XOR | S(TMP_REG1) | T(TMP_REG2) | D(TMP_REG1), DR(TMP_REG1)));
+	FAIL_IF(push_inst(compiler, SELECT_OP(DMTC1, MTC1) | T(TMP_REG1) | FS(dst), MOVABLE_INS));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	if (!(op & SLJIT_32))
+		return push_inst(compiler, NOP, UNMOVABLE_INS);
+#endif /* MIPS III */
+	return SLJIT_SUCCESS;
+}
+
 static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_ar, sljit_sw imm)
 {
 	sljit_s32 shift = 32;
@@ -141,11 +158,20 @@
 
 	u.value = value;
 
-	if (u.imm == 0)
-		return push_inst(compiler, DMTC1 | TA(0) | FS(freg), MOVABLE_INS);
+	if (u.imm == 0) {
+		FAIL_IF(push_inst(compiler, DMTC1 | TA(0) | FS(freg), MOVABLE_INS));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif /* MIPS III */
+		return SLJIT_SUCCESS;
+	}
 
 	FAIL_IF(load_immediate(compiler, DR(TMP_REG1), u.imm));
-	return push_inst(compiler, DMTC1 | T(TMP_REG1) | FS(freg), MOVABLE_INS);
+	FAIL_IF(push_inst(compiler, DMTC1 | T(TMP_REG1) | FS(freg), MOVABLE_INS));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif /* MIPS III */
+	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compiler, sljit_s32 op,
@@ -159,9 +185,15 @@
 	inst = T(reg) | FS(freg);
 
 	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64)
-		return push_inst(compiler, ((op & SLJIT_32) ? MTC1 : DMTC1) | inst, MOVABLE_INS);
+		FAIL_IF(push_inst(compiler, SELECT_OP(DMTC1, MTC1) | inst, MOVABLE_INS));
+	else
+		FAIL_IF(push_inst(compiler, SELECT_OP(DMFC1, MFC1) | inst, DR(reg)));
 
-	return push_inst(compiler, ((op & SLJIT_32) ? MFC1 : DMFC1) | inst, DR(reg));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	if (!(op & SLJIT_32))
+		return push_inst(compiler, NOP, UNMOVABLE_INS);
+#endif /* MIPS III */
+	return SLJIT_SUCCESS;
 }
 
 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
@@ -219,17 +251,17 @@
 		switch (types & SLJIT_ARG_MASK) {
 		case SLJIT_ARG_TYPE_F64:
 			if (arg_count != float_arg_count)
-				ins = MOV_S | FMT_D | FS(float_arg_count) | FD(arg_count);
+				ins = MOV_fmt(FMT_D) | FS(float_arg_count) | FD(arg_count);
 			else if (arg_count == 1)
-				ins = MOV_S | FMT_D | FS(SLJIT_FR0) | FD(TMP_FREG1);
+				ins = MOV_fmt(FMT_D) | FS(SLJIT_FR0) | FD(TMP_FREG1);
 			arg_count--;
 			float_arg_count--;
 			break;
 		case SLJIT_ARG_TYPE_F32:
 			if (arg_count != float_arg_count)
-				ins = MOV_S | FMT_S | FS(float_arg_count) | FD(arg_count);
+				ins = MOV_fmt(FMT_S) | FS(float_arg_count) | FD(arg_count);
 			else if (arg_count == 1)
-				ins = MOV_S | FMT_S | FS(SLJIT_FR0) | FD(TMP_FREG1);
+				ins = MOV_fmt(FMT_S) | FS(SLJIT_FR0) | FD(TMP_FREG1);
 			arg_count--;
 			float_arg_count--;
 			break;
@@ -336,7 +368,7 @@
 
 	SLJIT_ASSERT(DR(PIC_ADDR_REG) == 25 && PIC_ADDR_REG == TMP_REG2);
 
-	if (src & SLJIT_IMM)
+	if (src == SLJIT_IMM)
 		FAIL_IF(load_immediate(compiler, DR(PIC_ADDR_REG), srcw));
 	else if (src != PIC_ADDR_REG)
 		FAIL_IF(push_inst(compiler, DADDU | S(src) | TA(0) | D(PIC_ADDR_REG), DR(PIC_ADDR_REG)));
diff --git a/src/sljit/sljitNativeMIPS_common.c b/src/sljit/sljitNativeMIPS_common.c
index 6e53385..d80a75c 100644
--- a/src/sljit/sljitNativeMIPS_common.c
+++ b/src/sljit/sljitNativeMIPS_common.c
@@ -26,9 +26,12 @@
 
 /* Latest MIPS architecture. */
 
-#ifndef __mips_hard_float
+#ifdef HAVE_PRCTL
+#include <sys/prctl.h>
+#endif
+
+#if !defined(__mips_hard_float) || defined(__mips_single_float)
 /* Disable automatic detection, covers both -msoft-float and -mno-float */
-#undef SLJIT_IS_FPU_AVAILABLE
 #define SLJIT_IS_FPU_AVAILABLE 0
 #endif
 
@@ -42,6 +45,14 @@
 	return "MIPS64-R6" SLJIT_CPUINFO;
 #endif /* SLJIT_CONFIG_MIPS_32 */
 
+#elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 5)
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+	return "MIPS32-R5" SLJIT_CPUINFO;
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	return "MIPS64-R5" SLJIT_CPUINFO;
+#endif /* SLJIT_CONFIG_MIPS_32 */
+
 #elif (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -83,27 +94,34 @@
 #define EQUAL_FLAG	3
 #define OTHER_FLAG	1
 
-#define TMP_FREG1	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
-#define TMP_FREG2	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
-#define TMP_FREG3	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3)
-
 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
 	0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 23, 22, 21, 20, 19, 18, 17, 16, 29, 4, 25, 31
 };
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 
-static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = {
-	0, 0, 14, 2, 4, 6, 8, 18, 30, 28, 26, 24, 22, 20, 12, 10, 16
+#define TMP_FREG1	((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1)
+#define TMP_FREG2	((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2)
+#define TMP_FREG3	((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3)
+
+static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 4] = {
+	0,
+	0, 14, 2, 4, 6, 8, 18, 30, 28, 26, 24, 22, 20,
+	1, 15, 3, 5, 7, 9, 19, 31, 29, 27, 25, 23, 21,
+	12, 10, 16,
 };
 
-#else
+#else /* !SLJIT_CONFIG_MIPS_32 */
+
+#define TMP_FREG1	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
+#define TMP_FREG3	(SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3)
 
 static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = {
 	0, 0, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 1, 2, 3, 4, 5, 6, 7, 8, 9, 31, 30, 29, 28, 27, 26, 25, 24, 12, 11, 10
 };
 
-#endif
+#endif /* SLJIT_CONFIG_MIPS_32 */
 
 /* --------------------------------------------------------------------- */
 /*  Instruction forms                                                    */
@@ -200,12 +218,18 @@
 #define DMULTU		(HI(0) | LO(29))
 #endif /* SLJIT_MIPS_REV >= 6 */
 #define DIV_S		(HI(17) | FMT_S | LO(3))
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
 #define DINSU		(HI(31) | LO(6))
-#define DMFC1		(HI(17) | (1 << 21) | LO(0))
-#define DMTC1		(HI(17) | (5 << 21) | LO(0))
+#endif /* SLJIT_MIPS_REV >= 2 */
+#define DMFC1		(HI(17) | (1 << 21))
+#define DMTC1		(HI(17) | (5 << 21))
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
 #define DROTR		(HI(0) | (1 << 21) | LO(58))
 #define DROTR32		(HI(0) | (1 << 21) | LO(62))
 #define DROTRV		(HI(0) | (1 << 6) | LO(22))
+#define DSBH		(HI(31) | (2 << 6) | LO(36))
+#define DSHD		(HI(31) | (5 << 6) | LO(36))
+#endif /* SLJIT_MIPS_REV >= 2 */
 #define DSLL		(HI(0) | LO(56))
 #define DSLL32		(HI(0) | LO(60))
 #define DSLLV		(HI(0) | LO(20))
@@ -233,7 +257,10 @@
 #define LWL		(HI(34))
 #define LWR		(HI(38))
 #define LWC1		(HI(49))
-#define MFC1		(HI(17) | (0 << 21))
+#define MFC1		(HI(17))
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+#define MFHC1		(HI(17) | (3 << 21))
+#endif /* SLJIT_MIPS_REV >= 2 */
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
 #define MOD		(HI(0) | (3 << 6) | LO(26))
 #define MODU		(HI(0) | (3 << 6) | LO(27))
@@ -241,8 +268,10 @@
 #define MFHI		(HI(0) | LO(16))
 #define MFLO		(HI(0) | LO(18))
 #endif /* SLJIT_MIPS_REV >= 6 */
-#define MOV_S		(HI(17) | FMT_S | LO(6))
 #define MTC1		(HI(17) | (4 << 21))
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+#define MTHC1		(HI(17) | (7 << 21))
+#endif /* SLJIT_MIPS_REV >= 2 */
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 6)
 #define MUH		(HI(0) | (3 << 6) | LO(24))
 #define MUHU		(HI(0) | (3 << 6) | LO(25))
@@ -258,8 +287,10 @@
 #define NOR		(HI(0) | LO(39))
 #define OR		(HI(0) | LO(37))
 #define ORI		(HI(13))
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
 #define ROTR		(HI(0) | (1 << 21) | LO(2))
 #define ROTRV		(HI(0) | (1 << 6) | LO(6))
+#endif /* SLJIT_MIPS_REV >= 2 */
 #define SD		(HI(63))
 #define SDL		(HI(44))
 #define SDR		(HI(45))
@@ -281,6 +312,9 @@
 #define SWR		(HI(46))
 #define SWC1		(HI(57))
 #define TRUNC_W_S	(HI(17) | FMT_S | LO(13))
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+#define WSBH		(HI(31) | (2 << 6) | LO(32))
+#endif /* SLJIT_MIPS_REV >= 2 */
 #define XOR		(HI(0) | LO(38))
 #define XORI		(HI(14))
 
@@ -302,8 +336,10 @@
 #endif /* SLJIT_MIPS_REV >= 6 */
 #define PREF		(HI(51))
 #define PREFX		(HI(19) | LO(15))
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
 #define SEB		(HI(31) | (16 << 6) | LO(32))
 #define SEH		(HI(31) | (24 << 6) | LO(32))
+#endif /* SLJIT_MIPS_REV >= 2 */
 #endif /* SLJIT_MIPS_REV >= 1 */
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
@@ -324,10 +360,106 @@
 #define LOAD_W		LD
 #endif
 
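+/* MOV.fmt with the format field supplied by the caller; this replaces the
+   old MOV_S | FMT_x pairs so FMT(op) can be passed straight through. */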
+#define MOV_fmt(f)	(HI(17) | f | LO(6))
+
 #define SIMM_MAX	(0x7fff)
 #define SIMM_MIN	(-0x8000)
 #define UIMM_MAX	(0xffff)
 
+#define CPU_FEATURE_DETECTED	(1 << 0)
+#define CPU_FEATURE_FPU		(1 << 1)
+#define CPU_FEATURE_FP64	(1 << 2)
+#define CPU_FEATURE_FR		(1 << 3)
+
+static sljit_u32 cpu_feature_list = 0;
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) \
+	&& (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+
+static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s32 fr, sljit_s32 is_32)
+{
+	if (compiler->scratches == -1)
+		return 0;
+
+	if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0))
+		fr -= SLJIT_FS0;
+
+	return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
+		|| (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+}
+
+#endif /* SLJIT_CONFIG_MIPS_32 && SLJIT_ARGUMENT_CHECKS */
+
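+/* FR-mode detection (sketch): prefer prctl(PR_GET_FP_MODE) when the kernel
+   provides it; otherwise probe at run time by writing an odd single-precision
+   register and checking whether a following double load clobbers it (FR=0
+   pairs even/odd registers, FR=1 keeps the double registers separate). */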
+static void get_cpu_features(void)
+{
+#if !defined(SLJIT_IS_FPU_AVAILABLE) && defined(__GNUC__)
+	sljit_u32 fir = 0;
+#endif /* !SLJIT_IS_FPU_AVAILABLE && __GNUC__ */
+	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
+
+#if defined(SLJIT_IS_FPU_AVAILABLE)
+#if SLJIT_IS_FPU_AVAILABLE
+	feature_list |= CPU_FEATURE_FPU;
+#if SLJIT_IS_FPU_AVAILABLE == 64
+	feature_list |= CPU_FEATURE_FP64;
+#endif /* SLJIT_IS_FPU_AVAILABLE == 64 */
+#endif /* SLJIT_IS_FPU_AVAILABLE */
+#elif defined(__GNUC__)
+	__asm__ ("cfc1 %0, $0" : "=r"(fir));
+	if ((fir & (0x3 << 16)) == (0x3 << 16))
+		feature_list |= CPU_FEATURE_FPU;
+
+#if (defined(SLJIT_CONFIG_MIPS_64) && SLJIT_CONFIG_MIPS_64) \
+	&& (!defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV < 2)
+	if ((feature_list & CPU_FEATURE_FPU))
+		feature_list |= CPU_FEATURE_FP64;
+#else /* SLJIT_CONFIG_MIPS32 || SLJIT_MIPS_REV >= 2 */
+	if ((fir & (1 << 22)))
+		feature_list |= CPU_FEATURE_FP64;
+#endif /* SLJIT_CONFIG_MIPS_64 && SLJIT_MIPS_REV < 2 */
+#endif /* SLJIT_IS_FPU_AVAILABLE */
+
+	if ((feature_list & CPU_FEATURE_FPU) && (feature_list & CPU_FEATURE_FP64)) {
+#if defined(SLJIT_CONFIG_MIPS_32) && SLJIT_CONFIG_MIPS_32
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 6
+		feature_list |= CPU_FEATURE_FR;
+#elif defined(SLJIT_DETECT_FR) && SLJIT_DETECT_FR == 0
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 5
+		feature_list |= CPU_FEATURE_FR;
+#endif /* SLJIT_MIPS_REV >= 5 */
+#else
+		sljit_s32 flag = -1;
+#ifndef PR_GET_FP_MODE
+		sljit_f64 zero = 0.0;
+#else /* PR_GET_FP_MODE */
+		flag = prctl(PR_GET_FP_MODE);
+
+		if (flag > 0)
+			feature_list |= CPU_FEATURE_FR;
+#endif /* PR_GET_FP_MODE */
+#if ((defined(SLJIT_DETECT_FR) && SLJIT_DETECT_FR == 2) \
+	|| (!defined(PR_GET_FP_MODE) && (!defined(SLJIT_DETECT_FR) || SLJIT_DETECT_FR >= 1))) \
+	&& (defined(__GNUC__) && (defined(__mips) && __mips >= 2))
+		if (flag < 0) {
+			__asm__ (".set oddspreg\n"
+				"lwc1 $f17, %0\n"
+				"ldc1 $f16, %1\n"
+				"swc1 $f17, %0\n"
+			: "+m" (flag) : "m" (zero) : "$f16", "$f17");
+			if (flag)
+				feature_list |= CPU_FEATURE_FR;
+		}
+#endif /* (!PR_GET_FP_MODE || (PR_GET_FP_MODE && SLJIT_DETECT_FR == 2)) && __GNUC__ */
+#endif /* SLJIT_MIPS_REV >= 6 */
+#else /* !SLJIT_CONFIG_MIPS_32 */
+		/* StatusFR=1 is the only mode supported by this code on MIPS64. */
+		feature_list |= CPU_FEATURE_FR;
+#endif /* SLJIT_CONFIG_MIPS_32 */
+	}
+
+	cpu_feature_list = feature_list;
+}
+
 /* dest_reg is the absolute name of the register
    Useful for reordering instructions in the delay slot. */
 static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins, sljit_s32 delay_slot)
@@ -721,20 +853,20 @@
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
 {
-#if defined(__GNUC__) && !defined(SLJIT_IS_FPU_AVAILABLE)
-	sljit_sw fir = 0;
-#endif /* __GNUC__ && !SLJIT_IS_FPU_AVAILABLE */
-
 	switch (feature_type) {
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) \
+		&& (!defined(SLJIT_IS_FPU_AVAILABLE) || SLJIT_IS_FPU_AVAILABLE)
+	case SLJIT_HAS_F64_AS_F32_PAIR:
+		if (!cpu_feature_list)
+			get_cpu_features();
+
+		return (cpu_feature_list & CPU_FEATURE_FR) != 0;
+#endif /* SLJIT_CONFIG_MIPS_32 && SLJIT_IS_FPU_AVAILABLE */
 	case SLJIT_HAS_FPU:
-#ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
-#elif defined(__GNUC__)
-		__asm__ ("cfc1 %0, $0" : "=r"(fir));
-		return (fir >> 22) & 0x1;
-#else
-#error "FIR check is not implemented for this architecture"
-#endif
+		if (!cpu_feature_list)
+			get_cpu_features();
+
+		return (cpu_feature_list & CPU_FEATURE_FPU) != 0;
 	case SLJIT_HAS_ZERO_REGISTER:
 	case SLJIT_HAS_COPY_F32:
 	case SLJIT_HAS_COPY_F64:
@@ -749,6 +881,7 @@
 		return 2;
 #endif /* SLJIT_MIPS_REV >= 1 */
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
+	case SLJIT_HAS_REV:
 	case SLJIT_HAS_ROT:
 		return 1;
 #endif /* SLJIT_MIPS_REV >= 2 */
@@ -801,6 +934,12 @@
 static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit_s32 frame_size, sljit_ins *ins_ptr);
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+#define SELECT_OP(a, b)	(b)
+#else
+#define SELECT_OP(a, b)	(!(op & SLJIT_32) ? a : b)
+#endif
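+/* Defined before the arch-specific include below so the helpers in the
+   included file (e.g. the 64-bit emit_copysign) can use SELECT_OP. */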
+
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 #include "sljitNativeMIPS_32.c"
 #else
 #include "sljitNativeMIPS_64.c"
@@ -927,10 +1066,19 @@
 
 			if (word_arg_count == 0 && float_arg_count <= 2) {
 				if (float_arg_count == 1)
-					FAIL_IF(push_inst(compiler, MOV_S | FMT_D | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
+					FAIL_IF(push_inst(compiler, MOV_fmt(FMT_D) | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
 			} else if (arg_count < 4) {
 				FAIL_IF(push_inst(compiler, MTC1 | TA(4 + arg_count) | FS(float_arg_count), MOVABLE_INS));
-				FAIL_IF(push_inst(compiler, MTC1 | TA(5 + arg_count) | FS(float_arg_count) | (1 << 11), MOVABLE_INS));
+				switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+				case CPU_FEATURE_FR:
+					FAIL_IF(push_inst(compiler, MTHC1 | TA(5 + arg_count) | FS(float_arg_count), MOVABLE_INS));
+					break;
+#endif /* SLJIT_MIPS_REV >= 2 */
+				default:
+					FAIL_IF(push_inst(compiler, MTC1 | TA(5 + arg_count) | FS(float_arg_count) | (1 << 11), MOVABLE_INS));
+					break;
+				}
 			} else
 				FAIL_IF(push_inst(compiler, LDC1 | base | FT(float_arg_count) | IMM(local_size + (arg_count << 2)), MOVABLE_INS));
 			arg_count++;
@@ -940,7 +1088,7 @@
 
 			if (word_arg_count == 0 && float_arg_count <= 2) {
 				if (float_arg_count == 1)
-					FAIL_IF(push_inst(compiler, MOV_S | FMT_S | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
+					FAIL_IF(push_inst(compiler, MOV_fmt(FMT_S) | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
 			} else if (arg_count < 4)
 				FAIL_IF(push_inst(compiler, MTC1 | TA(4 + arg_count) | FS(float_arg_count), MOVABLE_INS));
 			else
@@ -975,16 +1123,16 @@
 		case SLJIT_ARG_TYPE_F64:
 			float_arg_count++;
 			if (arg_count != float_arg_count)
-				FAIL_IF(push_inst(compiler, MOV_S | FMT_D | FS(arg_count) | FD(float_arg_count), MOVABLE_INS));
+				FAIL_IF(push_inst(compiler, MOV_fmt(FMT_D) | FS(arg_count) | FD(float_arg_count), MOVABLE_INS));
 			else if (arg_count == 1)
-				FAIL_IF(push_inst(compiler, MOV_S | FMT_D | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
+				FAIL_IF(push_inst(compiler, MOV_fmt(FMT_D) | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
 			break;
 		case SLJIT_ARG_TYPE_F32:
 			float_arg_count++;
 			if (arg_count != float_arg_count)
-				FAIL_IF(push_inst(compiler, MOV_S | FMT_S | FS(arg_count) | FD(float_arg_count), MOVABLE_INS));
+				FAIL_IF(push_inst(compiler, MOV_fmt(FMT_S) | FS(arg_count) | FD(float_arg_count), MOVABLE_INS));
 			else if (arg_count == 1)
-				FAIL_IF(push_inst(compiler, MOV_S | FMT_S | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
+				FAIL_IF(push_inst(compiler, MOV_fmt(FMT_S) | FS(TMP_FREG1) | FD(SLJIT_FR0), MOVABLE_INS));
 			break;
 		default:
 			word_arg_count++;
@@ -1147,7 +1295,7 @@
 
 	FAIL_IF(emit_stack_frame_release(compiler, 1, &ins));
 
-	if (!(src & SLJIT_IMM)) {
+	if (src != SLJIT_IMM) {
 		FAIL_IF(push_inst(compiler, JR | S(src), UNMOVABLE_INS));
 		return push_inst(compiler, ins, UNMOVABLE_INS);
 	}
@@ -1397,16 +1545,12 @@
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
 
-#define SELECT_OP(a, b) (b)
-
 #define EMIT_SHIFT(dimm, dimm32, imm, dv, v) \
 	op_imm = (imm); \
 	op_v = (v);
 
 #else /* !SLJIT_CONFIG_MIPS_32 */
 
-#define SELECT_OP(a, b) \
-	(!(op & SLJIT_32) ? a : b)
 
 #define EMIT_SHIFT(dimm, dimm32, imm, dv, v) \
 	op_dimm = (dimm); \
@@ -1424,9 +1568,9 @@
 	sljit_s32 is_clz = (GET_OPCODE(op) == SLJIT_CLZ);
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 	sljit_ins word_size = (op & SLJIT_32) ? 32 : 64;
-#else /* !SLJIT_CONFIG_RISCV_64 */
+#else /* !SLJIT_CONFIG_MIPS_64 */
 	sljit_ins word_size = 32;
-#endif /* SLJIT_CONFIG_RISCV_64 */
+#endif /* SLJIT_CONFIG_MIPS_64 */
 
 	/* The TMP_REG2 is the next value. */
 	if (src != TMP_REG2)
@@ -1470,21 +1614,36 @@
 
 static sljit_s32 emit_rev(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
 {
-	SLJIT_UNUSED_ARG(op);
+#if defined(SLJIT_CONFIG_MIPS_64) && SLJIT_CONFIG_MIPS_64
+	int is_32 = (op & SLJIT_32);
+#endif /* SLJIT_CONFIG_MIPS_64 */
 
-#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-	if (!(op & SLJIT_32)) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
-		FAIL_IF(push_inst(compiler, ORI | SA(0) | TA(OTHER_FLAG) | 0xffff, OTHER_FLAG));
-		FAIL_IF(push_inst(compiler, DROTR32 | T(src) | D(dst) | SH_IMM(0), DR(dst)));
-		FAIL_IF(push_inst(compiler, DSLL32 | TA(OTHER_FLAG) | DA(OTHER_FLAG) | SH_IMM(0), OTHER_FLAG));
+	op = GET_OPCODE(op);
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+#if defined(SLJIT_CONFIG_MIPS_64) && SLJIT_CONFIG_MIPS_64
+	if (!is_32 && (op == SLJIT_REV)) {
+		FAIL_IF(push_inst(compiler, DSBH | T(src) | D(dst), DR(dst)));
+		return push_inst(compiler, DSHD | T(dst) | D(dst), DR(dst));
+	}
+	if (op != SLJIT_REV && src != TMP_REG2) {
+		FAIL_IF(push_inst(compiler, SLL | T(src) | D(TMP_REG1), DR(TMP_REG1)));
+		src = TMP_REG1;
+	}
+#endif /* SLJIT_CONFIG_MIPS_64 */
+	FAIL_IF(push_inst(compiler, WSBH | T(src) | D(dst), DR(dst)));
+	FAIL_IF(push_inst(compiler, ROTR | T(dst) | D(dst) | SH_IMM(16), DR(dst)));
+#if defined(SLJIT_CONFIG_MIPS_64) && SLJIT_CONFIG_MIPS_64
+	if (op == SLJIT_REV_U32 && dst != TMP_REG2 && dst != TMP_REG3)
+		FAIL_IF(push_inst(compiler, DINSU | T(dst) | SA(0) | (31 << 11), DR(dst)));
+#endif /* SLJIT_CONFIG_MIPS_64 */
 #else /* SLJIT_MIPS_REV < 2 */
+#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
+	if (!is_32) {
 		FAIL_IF(push_inst(compiler, DSRL32 | T(src) | D(TMP_REG1) | SH_IMM(0), DR(TMP_REG1)));
 		FAIL_IF(push_inst(compiler, ORI | SA(0) | TA(OTHER_FLAG) | 0xffff, OTHER_FLAG));
 		FAIL_IF(push_inst(compiler, DSLL32 | T(src) | D(dst) | SH_IMM(0), DR(dst)));
 		FAIL_IF(push_inst(compiler, DSLL32 | TA(OTHER_FLAG) | DA(OTHER_FLAG) | SH_IMM(0), OTHER_FLAG));
 		FAIL_IF(push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst)));
-#endif /* SLJIT_MIPS_REV >= 2 */
 
 		FAIL_IF(push_inst(compiler, DSRL | T(dst) | D(TMP_REG1) | SH_IMM(16), DR(TMP_REG1)));
 		FAIL_IF(push_inst(compiler, ORI | SA(OTHER_FLAG) | TA(OTHER_FLAG) | 0xffff, OTHER_FLAG));
@@ -1502,23 +1661,17 @@
 		return push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst));
 	}
 
-	if (GET_OPCODE(op) != SLJIT_REV && src != TMP_REG2) {
+	if (op != SLJIT_REV && src != TMP_REG2) {
 		FAIL_IF(push_inst(compiler, SLL | T(src) | D(TMP_REG2) | SH_IMM(0), DR(TMP_REG2)));
 		src = TMP_REG2;
 	}
 #endif /* SLJIT_CONFIG_MIPS_64 */
 
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
-	FAIL_IF(push_inst(compiler, LUI | TA(OTHER_FLAG) | 0xff, OTHER_FLAG));
-	FAIL_IF(push_inst(compiler, ROTR | T(src) | D(dst) | SH_IMM(16), DR(dst)));
-	FAIL_IF(push_inst(compiler, ORI | SA(OTHER_FLAG) | TA(OTHER_FLAG) | 0xff, OTHER_FLAG));
-#else /* SLJIT_MIPS_REV < 2 */
 	FAIL_IF(push_inst(compiler, SRL | T(src) | D(TMP_REG1) | SH_IMM(16), DR(TMP_REG1)));
 	FAIL_IF(push_inst(compiler, LUI | TA(OTHER_FLAG) | 0xff, OTHER_FLAG));
 	FAIL_IF(push_inst(compiler, SLL | T(src) | D(dst) | SH_IMM(16), DR(dst)));
 	FAIL_IF(push_inst(compiler, ORI | SA(OTHER_FLAG) | TA(OTHER_FLAG) | 0xff, OTHER_FLAG));
 	FAIL_IF(push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst)));
-#endif /* SLJIT_MIPS_REV >= 2 */
 
 	FAIL_IF(push_inst(compiler, SRL | T(dst) | D(TMP_REG1) | SH_IMM(8), DR(TMP_REG1)));
 	FAIL_IF(push_inst(compiler, AND | S(dst) | TA(OTHER_FLAG) | D(dst), DR(dst)));
@@ -1527,25 +1680,34 @@
 	FAIL_IF(push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst)));
 
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
-	if (GET_OPCODE(op) == SLJIT_REV_U32 && dst != TMP_REG2) {
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
-		return push_inst(compiler, DINSU | T(dst) | SA(0) | (31 << 11) | (0 << 6), DR(dst));
-#else /* SLJIT_MIPS_REV < 2 */
+	if (op == SLJIT_REV_U32 && dst != TMP_REG2 && dst != TMP_REG3) {
 		FAIL_IF(push_inst(compiler, DSLL32 | T(dst) | D(dst) | SH_IMM(0), DR(dst)));
-		return push_inst(compiler, DSRL32 | T(dst) | D(dst) | SH_IMM(0), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 2 */
+		FAIL_IF(push_inst(compiler, DSRL32 | T(dst) | D(dst) | SH_IMM(0), DR(dst)));
 	}
 #endif /* SLJIT_CONFIG_MIPS_64 */
+#endif /* SLJIT_MIPS_REV >= 2 */
 	return SLJIT_SUCCESS;
 }
 
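+/* On REV2+ cores WSBH/DSBH (swap bytes within halfwords) plus SEH or ANDI
+   give the 16-bit byte reverse directly; older cores use the shift-and-mask
+   fallback below. */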
 static sljit_s32 emit_rev16(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
 {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+#if defined(SLJIT_CONFIG_MIPS_32) && SLJIT_CONFIG_MIPS_32
+	FAIL_IF(push_inst(compiler, WSBH | T(src) | D(dst), DR(dst)));
+#else /* !SLJIT_CONFIG_MIPS_32 */
+	FAIL_IF(push_inst(compiler, DSBH | T(src) | D(dst), DR(dst)));
+#endif /* SLJIT_CONFIG_MIPS_32 */
+	if (GET_OPCODE(op) == SLJIT_REV_U16)
+		return push_inst(compiler, ANDI | S(dst) | T(dst) | 0xffff, DR(dst));
+	else
+		return push_inst(compiler, SEH | T(dst) | D(dst), DR(dst));
+#else /* SLJIT_MIPS_REV < 2 */
 	FAIL_IF(push_inst(compiler, SELECT_OP(DSRL, SRL) | T(src) | D(TMP_REG1) | SH_IMM(8), DR(TMP_REG1)));
 	FAIL_IF(push_inst(compiler, SELECT_OP(DSLL32, SLL) | T(src) | D(dst) | SH_IMM(24), DR(dst)));
 	FAIL_IF(push_inst(compiler, ANDI | S(TMP_REG1) | T(TMP_REG1) | 0xff, DR(TMP_REG1)));
 	FAIL_IF(push_inst(compiler, (GET_OPCODE(op) == SLJIT_REV_U16 ? SELECT_OP(DSRL32, SRL) : SELECT_OP(DSRA32, SRA)) | T(dst) | D(dst) | SH_IMM(16), DR(dst)));
 	return push_inst(compiler, OR | S(dst) | T(TMP_REG1) | D(dst), DR(dst));
+#endif /* SLJIT_MIPS_REV >= 2 */
 }
 
 static SLJIT_INLINE sljit_s32 emit_single_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 flags,
@@ -1575,17 +1737,17 @@
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
 			return push_inst(compiler, SEB | T(src2) | D(dst), DR(dst));
-#else /* SLJIT_MIPS_REV < 1 */
+#else /* SLJIT_MIPS_REV < 2 */
 			FAIL_IF(push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(24), DR(dst)));
 			return push_inst(compiler, SRA | T(dst) | D(dst) | SH_IMM(24), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
+#endif /* SLJIT_MIPS_REV >= 2 */
 #else /* !SLJIT_CONFIG_MIPS_32 */
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
 			if (op & SLJIT_32)
 				return push_inst(compiler, SEB | T(src2) | D(dst), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
+#endif /* SLJIT_MIPS_REV >= 2 */
 			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(24), DR(dst)));
 			return push_inst(compiler, DSRA32 | T(dst) | D(dst) | SH_IMM(24), DR(dst));
 #endif /* SLJIT_CONFIG_MIPS_32 */
@@ -1604,17 +1766,17 @@
 		SLJIT_ASSERT(src1 == TMP_REG1 && !(flags & SRC2_IMM));
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
 			return push_inst(compiler, SEH | T(src2) | D(dst), DR(dst));
-#else /* SLJIT_MIPS_REV < 1 */
+#else /* SLJIT_MIPS_REV < 2 */
 			FAIL_IF(push_inst(compiler, SLL | T(src2) | D(dst) | SH_IMM(16), DR(dst)));
 			return push_inst(compiler, SRA | T(dst) | D(dst) | SH_IMM(16), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
+#endif /* SLJIT_MIPS_REV >= 2 */
 #else /* !SLJIT_CONFIG_MIPS_32 */
-#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1)
+#if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
 			if (op & SLJIT_32)
 				return push_inst(compiler, SEH | T(src2) | D(dst), DR(dst));
-#endif /* SLJIT_MIPS_REV >= 1 */
+#endif /* SLJIT_MIPS_REV >= 2 */
 			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(16), DR(dst)));
 			return push_inst(compiler, DSRA32 | T(dst) | D(dst) | SH_IMM(16), DR(dst));
 #endif /* SLJIT_CONFIG_MIPS_32 */
@@ -1628,7 +1790,7 @@
 		if ((flags & (REG_DEST | REG2_SOURCE)) == (REG_DEST | REG2_SOURCE)) {
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
 			if (dst == src2)
-				return push_inst(compiler, DINSU | T(src2) | SA(0) | (31 << 11) | (0 << 11), DR(dst));
+				return push_inst(compiler, DINSU | T(src2) | SA(0) | (31 << 11), DR(dst));
 #endif /* SLJIT_MIPS_REV >= 2 */
 			FAIL_IF(push_inst(compiler, DSLL32 | T(src2) | D(dst) | SH_IMM(0), DR(dst)));
 			return push_inst(compiler, DSRL32 | T(dst) | D(dst) | SH_IMM(0), DR(dst));
@@ -2144,10 +2306,10 @@
 		flags |= SLOW_DEST;
 
 	if (flags & IMM_OP) {
-		if ((src2 & SLJIT_IMM) && src2w != 0 && CHECK_IMM(flags, src2w)) {
+		if (src2 == SLJIT_IMM && src2w != 0 && CHECK_IMM(flags, src2w)) {
 			flags |= SRC2_IMM;
 			src2_r = src2w;
-		} else if ((flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w != 0 && CHECK_IMM(flags, src1w)) {
+		} else if ((flags & CUMULATIVE_OP) && src1 == SLJIT_IMM && src1w != 0 && CHECK_IMM(flags, src1w)) {
 			flags |= SRC2_IMM;
 			src2_r = src1w;
 
@@ -2164,7 +2326,7 @@
 		src1_r = src1;
 		flags |= REG1_SOURCE;
 	}
-	else if (src1 & SLJIT_IMM) {
+	else if (src1 == SLJIT_IMM) {
 		if (src1w) {
 			FAIL_IF(load_immediate(compiler, DR(TMP_REG1), src1w));
 			src1_r = TMP_REG1;
@@ -2187,7 +2349,7 @@
 		if ((flags & (REG_DEST | MOVE_OP)) == MOVE_OP)
 			dst_r = (sljit_s32)src2_r;
 	}
-	else if (src2 & SLJIT_IMM) {
+	else if (src2 == SLJIT_IMM) {
 		if (!(flags & SRC2_IMM)) {
 			if (src2w) {
 				FAIL_IF(load_immediate(compiler, DR(sugg_src2_r), src2w));
@@ -2375,24 +2537,24 @@
 
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 	case SLJIT_MOV_U32:
-		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u32)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u32)srcw : srcw);
 
 	case SLJIT_MOV_S32:
 	case SLJIT_MOV32:
-		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s32)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s32)srcw : srcw);
 #endif
 
 	case SLJIT_MOV_U8:
-		return emit_op(compiler, op, BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
+		return emit_op(compiler, op, BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u8)srcw : srcw);
 
 	case SLJIT_MOV_S8:
-		return emit_op(compiler, op, BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
+		return emit_op(compiler, op, BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s8)srcw : srcw);
 
 	case SLJIT_MOV_U16:
-		return emit_op(compiler, op, HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
+		return emit_op(compiler, op, HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u16)srcw : srcw);
 
 	case SLJIT_MOV_S16:
-		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
+		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s16)srcw : srcw);
 
 	case SLJIT_CLZ:
 	case SLJIT_CTZ:
@@ -2428,9 +2590,9 @@
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 	if (op & SLJIT_32) {
 		flags |= INT_DATA | SIGNED_DATA;
-		if (src1 & SLJIT_IMM)
+		if (src1 == SLJIT_IMM)
 			src1w = (sljit_s32)src1w;
-		if (src2 & SLJIT_IMM)
+		if (src2 == SLJIT_IMM)
 			src2w = (sljit_s32)src2w;
 	}
 #endif
@@ -2451,7 +2613,7 @@
 		return emit_op(compiler, op, flags | CUMULATIVE_OP, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_XOR:
-		if (((src1 & SLJIT_IMM) && src1w == -1) || ((src2 & SLJIT_IMM) && src2w == -1)) {
+		if ((src1 == SLJIT_IMM && src1w == -1) || (src2 == SLJIT_IMM && src2w == -1)) {
 			return emit_op(compiler, op, flags | CUMULATIVE_OP | IMM_OP, dst, dstw, src1, src1w, src2, src2w);
 		}
 		/* fallthrough */
@@ -2468,10 +2630,10 @@
 	case SLJIT_ROTL:
 	case SLJIT_ROTR:
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-		if (src2 & SLJIT_IMM)
+		if (src2 == SLJIT_IMM)
 			src2w &= 0x1f;
 #else
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			if (op & SLJIT_32)
 				src2w &= 0x1f;
 			else
@@ -2532,7 +2694,7 @@
 
 	ADJUST_LOCAL_OFFSET(src3, src3w);
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		src3w &= bit_length - 1;
 
 		if (src3w == 0)
@@ -2653,7 +2815,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return reg_map[reg];
 
 	if (type != SLJIT_FLOAT_REGISTER)
@@ -2665,6 +2827,8 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
 	void *instruction, sljit_u32 size)
 {
+	SLJIT_UNUSED_ARG(size);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
 
@@ -2676,14 +2840,14 @@
 /* --------------------------------------------------------------------- */
 
 #define FLOAT_DATA(op) (DOUBLE_DATA | ((op & SLJIT_32) >> 7))
-#define FMT(op) ((~(sljit_ins)op & SLJIT_32) << (21 - 8))
+#define FMT(op) (FMT_S | (~(sljit_ins)op & SLJIT_32) << (21 - (5 + 3)))
 
 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 src, sljit_sw srcw)
 {
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#	define flags (sljit_u32)0
+	sljit_u32 flags = 0;
 #else
 	sljit_u32 flags = ((sljit_u32)(GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)) << 21;
 #endif
@@ -2697,18 +2861,13 @@
 
 	if (FAST_IS_REG(dst)) {
 		FAIL_IF(push_inst(compiler, MFC1 | flags | T(dst) | FS(TMP_FREG1), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 1)
+#if !defined(SLJIT_MIPS_REV) || (SLJIT_CONFIG_MIPS_32 && SLJIT_MIPS_REV <= 1)
 		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV <= 1 */
+#endif /* MIPS III */
 		return SLJIT_SUCCESS;
 	}
 
-	/* Store the integer value from a VFP register. */
 	return emit_op_mem2(compiler, flags ? DOUBLE_DATA : SINGLE_DATA, FR(TMP_FREG1), dst, dstw, 0, 0);
-
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-#	undef flags
-#endif
 }
 
 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
@@ -2722,11 +2881,10 @@
 #endif
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
-	if (src & SLJIT_MEM) {
-		/* Load the integer value into a VFP register. */
+	if (src & SLJIT_MEM)
 		FAIL_IF(emit_op_mem2(compiler, (flags ? DOUBLE_DATA : SINGLE_DATA) | LOAD_DATA, FR(TMP_FREG1), src, srcw, dst, dstw));
-	} else {
-		if (src & SLJIT_IMM) {
+	else {
+		if (src == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 			if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 				srcw = (sljit_s32)srcw;
@@ -2736,9 +2894,9 @@
 		}
 
 		FAIL_IF(push_inst(compiler, MTC1 | flags | T(src) | FS(TMP_FREG1), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 1)
+#if !defined(SLJIT_MIPS_REV) || (SLJIT_CONFIG_MIPS_32 && SLJIT_MIPS_REV <= 1)
 		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV <= 1 */
+#endif /* MIPS III */
 	}
 
 	FAIL_IF(push_inst(compiler, CVT_S_S | flags | (4 << 21) | ((~(sljit_ins)op & SLJIT_32) >> 8) | FS(TMP_FREG1) | FD(dst_r), MOVABLE_INS));
@@ -2762,7 +2920,7 @@
 	if (src & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem2(compiler, (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_UW ? WORD_DATA : INT_DATA) | LOAD_DATA, DR(TMP_REG1), src, srcw, dst, dstw));
 		src = TMP_REG1;
-	} else if (src & SLJIT_IMM) {
+	} else if (src == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_U32)
 			srcw = (sljit_u32)srcw;
@@ -2779,9 +2937,9 @@
 		}
 
 		FAIL_IF(push_inst(compiler, MTC1 | flags | T(TMP_REG1) | FS(TMP_FREG1), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 1)
+#if !defined(SLJIT_MIPS_REV)
 		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV <= 1 */
+#endif /* MIPS III */
 
 		FAIL_IF(push_inst(compiler, CVT_S_S | flags | (4 << 21) | ((~(sljit_ins)op & SLJIT_32) >> 8) | FS(TMP_FREG1) | FD(dst_r), MOVABLE_INS));
 
@@ -2795,9 +2953,9 @@
 		FAIL_IF(push_inst(compiler, SRL | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(1), DR(TMP_REG2)));
 
 		FAIL_IF(push_inst(compiler, MTC1 | flags | T(TMP_REG2) | FS(TMP_FREG1), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 1)
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
 		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV <= 1 */
+#endif /* MIPS III */
 
 		FAIL_IF(push_inst(compiler, CVT_S_S | flags | (4 << 21) | 1 | FS(TMP_FREG1) | FD(dst_r), MOVABLE_INS));
 
@@ -2809,11 +2967,19 @@
 
 		FAIL_IF(push_inst(compiler, LUI | T(TMP_REG2) | IMM(0x41e0), UNMOVABLE_INS));
 		FAIL_IF(push_inst(compiler, MTC1 | TA(0) | FS(TMP_FREG2), UNMOVABLE_INS));
-		FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | FS(TMP_FREG2) | (1 << 11), UNMOVABLE_INS));
-
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 1)
-		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV <= 1 */
+		switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+		case CPU_FEATURE_FR:
+			FAIL_IF(push_inst(compiler, MTHC1 | T(TMP_REG2) | FS(TMP_FREG2), UNMOVABLE_INS));
+			break;
+#endif /* SLJIT_MIPS_REV >= 2 */
+		default:
+			FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | FS(TMP_FREG2) | (1 << 11), UNMOVABLE_INS));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+			FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif /* MIPS III */
+			break;
+		}
 		FAIL_IF(push_inst(compiler, ADD_S | FMT(op) | FT(TMP_FREG2) | FS(dst_r) | FD(dst_r), UNMOVABLE_INS));
 
 		if (dst & SLJIT_MEM)
@@ -2830,9 +2996,9 @@
 	FAIL_IF(push_inst(compiler, ANDI | S(src) | T(TMP_REG2) | IMM(1), DR(TMP_REG2)));
 
 	FAIL_IF(push_inst(compiler, MTC1 | flags | T(src) | FS(TMP_FREG1), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 1)
+#if !defined(SLJIT_MIPS_REV)
 	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV <= 1 */
+#endif /* !SLJIT_MIPS_REV */
 
 	FAIL_IF(push_inst(compiler, CVT_S_S | flags | (4 << 21) | ((~(sljit_ins)op & SLJIT_32) >> 8) | FS(TMP_FREG1) | FD(dst_r), MOVABLE_INS));
 
@@ -2851,9 +3017,9 @@
 	FAIL_IF(push_inst(compiler, OR | S(TMP_REG1) | T(TMP_REG2) | D(TMP_REG1), DR(TMP_REG1)));
 
 	FAIL_IF(push_inst(compiler, MTC1 | flags | T(TMP_REG1) | FS(TMP_FREG1), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 1)
+#if !defined(SLJIT_MIPS_REV)
 	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif /* SLJIT_MIPS_REV <= 1 */
+#endif /* !SLJIT_MIPS_REV */
 
 	FAIL_IF(push_inst(compiler, CVT_S_S | flags | (4 << 21) | ((~(sljit_ins)op & SLJIT_32) >> 8) | FS(TMP_FREG1) | FD(dst_r), MOVABLE_INS));
 	FAIL_IF(push_inst(compiler, ADD_S | FMT(op) | FT(dst_r) | FS(dst_r) | FD(dst_r), UNMOVABLE_INS));
@@ -2939,7 +3105,7 @@
 	case SLJIT_MOV_F64:
 		if (src != dst_r) {
 			if (dst_r != TMP_FREG1)
-				FAIL_IF(push_inst(compiler, MOV_S | FMT(op) | FS(src) | FD(dst_r), MOVABLE_INS));
+				FAIL_IF(push_inst(compiler, MOV_fmt(FMT(op)) | FS(src) | FD(dst_r), MOVABLE_INS));
 			else
 				dst_r = src;
 		}
@@ -3030,13 +3196,7 @@
 		FAIL_IF(push_inst(compiler, DIV_S | FMT(op) | FT(src2) | FS(src1) | FD(dst_r), MOVABLE_INS));
 		break;
 	case SLJIT_COPYSIGN_F64:
-		FAIL_IF(push_inst(compiler, SELECT_OP(DMFC1, MFC1) | T(TMP_REG1) | FS(src1), DR(TMP_REG1)));
-		FAIL_IF(push_inst(compiler, SELECT_OP(DMFC1, MFC1) | T(TMP_REG2) | FS(src2), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, XOR | S(TMP_REG2) | T(TMP_REG1) | D(TMP_REG2), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSRL32, SRL) | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(31), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, SELECT_OP(DSLL32, SLL) | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(31), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, XOR | S(TMP_REG1) | T(TMP_REG2) | D(TMP_REG1), DR(TMP_REG1)));
-		return push_inst(compiler, SELECT_OP(DMTC1, MTC1) | T(TMP_REG1) | FS(dst_r), MOVABLE_INS);
+		return emit_copysign(compiler, op, src1, src2, dst_r);
 	}
 
 	if (dst_r == TMP_FREG2)
@@ -3221,7 +3381,7 @@
 }
 
 #define RESOLVE_IMM1() \
-	if (src1 & SLJIT_IMM) { \
+	if (src1 == SLJIT_IMM) { \
 		if (src1w) { \
 			PTR_FAIL_IF(load_immediate(compiler, DR(TMP_REG1), src1w)); \
 			src1 = TMP_REG1; \
@@ -3231,7 +3391,7 @@
 	}
 
 #define RESOLVE_IMM2() \
-	if (src2 & SLJIT_IMM) { \
+	if (src2 == SLJIT_IMM) { \
 		if (src2w) { \
 			PTR_FAIL_IF(load_immediate(compiler, DR(TMP_REG2), src2w)); \
 			src2 = TMP_REG2; \
@@ -3283,10 +3443,9 @@
 		if (compiler->delay_slot == MOVABLE_INS || (compiler->delay_slot != UNMOVABLE_INS && compiler->delay_slot != DR(src1) && compiler->delay_slot != DR(src2)))
 			jump->flags |= IS_MOVABLE;
 		PTR_FAIL_IF(push_inst(compiler, (type == SLJIT_EQUAL ? BNE : BEQ) | S(src1) | T(src2) | BRANCH_LENGTH, UNMOVABLE_INS));
-	}
-	else if (type >= SLJIT_SIG_LESS && (((src1 & SLJIT_IMM) && (src1w == 0)) || ((src2 & SLJIT_IMM) && (src2w == 0)))) {
+	} else if (type >= SLJIT_SIG_LESS && ((src1 == SLJIT_IMM && src1w == 0) || (src2 == SLJIT_IMM && src2w == 0))) {
 		inst = NOP;
-		if ((src1 & SLJIT_IMM) && (src1w == 0)) {
+		if (src1 == SLJIT_IMM && src1w == 0) {
 			RESOLVE_IMM2();
 			switch (type) {
 			case SLJIT_SIG_LESS:
@@ -3334,7 +3493,7 @@
 	else {
 		if (type == SLJIT_LESS || type == SLJIT_GREATER_EQUAL || type == SLJIT_SIG_LESS || type == SLJIT_SIG_GREATER_EQUAL) {
 			RESOLVE_IMM1();
-			if ((src2 & SLJIT_IMM) && src2w <= SIMM_MAX && src2w >= SIMM_MIN)
+			if (src2 == SLJIT_IMM && src2w <= SIMM_MAX && src2w >= SIMM_MIN)
 				PTR_FAIL_IF(push_inst(compiler, (type <= SLJIT_LESS_EQUAL ? SLTIU : SLTI) | S(src1) | T(TMP_REG1) | IMM(src2w), DR(TMP_REG1)));
 			else {
 				RESOLVE_IMM2();
@@ -3344,7 +3503,7 @@
 		}
 		else {
 			RESOLVE_IMM2();
-			if ((src1 & SLJIT_IMM) && src1w <= SIMM_MAX && src1w >= SIMM_MIN)
+			if (src1 == SLJIT_IMM && src1w <= SIMM_MAX && src1w >= SIMM_MIN)
 				PTR_FAIL_IF(push_inst(compiler, (type <= SLJIT_LESS_EQUAL ? SLTIU : SLTI) | S(src2) | T(TMP_REG1) | IMM(src1w), DR(TMP_REG1)));
 			else {
 				RESOLVE_IMM1();
@@ -3386,7 +3545,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
 		FAIL_IF(!jump);
 		set_jump(jump, compiler, JUMP_ADDR | ((type >= SLJIT_FAST_CALL) ? IS_JAL : 0));
@@ -3418,8 +3577,7 @@
 #endif
 	}
 
-	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-	return SLJIT_SUCCESS;
+	return push_inst(compiler, NOP, UNMOVABLE_INS);
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
@@ -3594,12 +3752,13 @@
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_select(compiler, type, dst_reg, src1, src1w, src2_reg));
+	ADJUST_LOCAL_OFFSET(src1, src1w);
 
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1 && SLJIT_MIPS_REV < 6)
 	if (src1 & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem(compiler, inp_flags, DR(TMP_REG2), src1, src1w));
 		src1 = TMP_REG2;
-	} else if (src1 & SLJIT_IMM) {
+	} else if (src1 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 		if (type & SLJIT_32)
 			src1w = (sljit_s32)src1w;
@@ -3645,11 +3804,11 @@
 
 	if (src1 & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem(compiler, inp_flags, DR(dst_reg), src1, src1w));
-	} else if (src1 & SLJIT_IMM) {
+	} else if (src1 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
 		if (type & SLJIT_32)
 			src1w = (sljit_s32)src1w;
-#endif /* SLJIT_CONFIG_RISCV_64 */
+#endif /* SLJIT_CONFIG_MIPS_64 */
 		FAIL_IF(load_immediate(compiler, DR(dst_reg), src1w));
 	} else
 		FAIL_IF(push_inst(compiler, mov_ins | S(src1) | TA(0) | D(dst_reg), DR(dst_reg)));
@@ -3684,7 +3843,7 @@
 			src1w = 0;
 			type ^= 0x1;
 		} else
-			FAIL_IF(push_inst(compiler, MOV_S | FMT(type) | FS(src2_freg) | FD(dst_freg), MOVABLE_INS));
+			FAIL_IF(push_inst(compiler, MOV_fmt(FMT(type)) | FS(src2_freg) | FD(dst_freg), MOVABLE_INS));
 	}
 
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 1 && SLJIT_MIPS_REV < 6)
@@ -3703,7 +3862,7 @@
 	if (src1 & SLJIT_MEM)
 		FAIL_IF(emit_op_mem(compiler, FLOAT_DATA(type) | LOAD_DATA, FR(dst_freg), src1, src1w));
 	else
-		FAIL_IF(push_inst(compiler, MOV_S | FMT(type) | FS(src1) | FD(dst_freg), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, MOV_fmt(FMT(type)) | FS(src1) | FD(dst_freg), MOVABLE_INS));
 
 	SLJIT_SKIP_CHECKS(compiler);
 	label = sljit_emit_label(compiler);
@@ -3759,15 +3918,27 @@
 }
 
 #if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
-#define MEM16_IMM_FIRST(memw) IMM((memw) + 1)
-#define MEM16_IMM_SECOND(memw) IMM(memw)
-#define MEMF64_FS_FIRST(freg) FS(freg)
-#define MEMF64_FS_SECOND(freg) (FS(freg) | ((sljit_ins)1 << 11))
+#define IMM_LEFT(memw)			IMM((memw) + SSIZE_OF(sw) - 1)
+#define IMM_RIGHT(memw)			IMM(memw)
+#define IMM_32_LEFT(memw)		IMM((memw) + SSIZE_OF(s32) - 1)
+#define IMM_32_RIGHT(memw)		IMM(memw)
+#define IMM_F64_FIRST_LEFT(memw)	IMM((memw) + SSIZE_OF(s32) - 1)
+#define IMM_F64_FIRST_RIGHT(memw)	IMM(memw)
+#define IMM_F64_SECOND_LEFT(memw)	IMM((memw) + SSIZE_OF(f64) - 1)
+#define IMM_F64_SECOND_RIGHT(memw)	IMM((memw) + SSIZE_OF(s32))
+#define IMM_16_FIRST(memw)		IMM((memw) + 1)
+#define IMM_16_SECOND(memw)		IMM(memw)
 #else /* !SLJIT_LITTLE_ENDIAN */
-#define MEM16_IMM_FIRST(memw) IMM(memw)
-#define MEM16_IMM_SECOND(memw) IMM((memw) + 1)
-#define MEMF64_FS_FIRST(freg) (FS(freg) | ((sljit_ins)1 << 11))
-#define MEMF64_FS_SECOND(freg) FS(freg)
+#define IMM_LEFT(memw)			IMM(memw)
+#define IMM_RIGHT(memw)			IMM((memw) + SSIZE_OF(sw) - 1)
+#define IMM_32_LEFT(memw)		IMM(memw)
+#define IMM_32_RIGHT(memw)		IMM((memw) + SSIZE_OF(s32) - 1)
+#define IMM_F64_FIRST_LEFT(memw)	IMM((memw) + SSIZE_OF(s32))
+#define IMM_F64_FIRST_RIGHT(memw)	IMM((memw) + SSIZE_OF(f64) - 1)
+#define IMM_F64_SECOND_LEFT(memw)	IMM(memw)
+#define IMM_F64_SECOND_RIGHT(memw)	IMM((memw) + SSIZE_OF(s32) - 1)
+#define IMM_16_FIRST(memw)		IMM(memw)
+#define IMM_16_SECOND(memw)		IMM((memw) + 1)
 #endif /* SLJIT_LITTLE_ENDIAN */
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
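
The renamed macros encode which end of a word the unaligned-access instructions touch: on pre-R6 MIPS, LWL/SWL (and LDL/SDL) handle the most-significant, "left" bytes and LWR/SWR the rest, and whether the left byte sits at the low or the high address depends on byte order. A standalone check of the offsets the 32-bit pair receives (assuming SSIZE_OF(s32) is 4):

    #include <stdio.h>

    #define SIZE_S32 4

    int main(void)
    {
        int memw = 8;
    #if defined(WORDS_BIGENDIAN)   /* stand-in for !SLJIT_LITTLE_ENDIAN */
        printf("LWL at %d, LWR at %d\n", memw, memw + SIZE_S32 - 1);
    #else
        printf("LWL at %d, LWR at %d\n", memw + SIZE_S32 - 1, memw);
    #endif
        return 0;
    }
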
@@ -3810,10 +3981,10 @@
 			ins_right = ((type & SLJIT_MEM_STORE) ? SDR : LDR) | S(mem);
 #endif /* SLJIT_CONFIG_MIPS_32 */
 
-			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM(memw), DR(REG_PAIR_FIRST(reg))));
-			FAIL_IF(push_inst(compiler, ins_right | T(REG_PAIR_FIRST(reg)) | IMM(memw + (SSIZE_OF(sw) - 1)), DR(REG_PAIR_FIRST(reg))));
-			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg))));
-			return push_inst(compiler, ins_right | T(REG_PAIR_SECOND(reg)) | IMM((memw + 2 * SSIZE_OF(sw) - 1)), DR(REG_PAIR_SECOND(reg)));
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_FIRST(reg)) | IMM_LEFT(memw), DR(REG_PAIR_FIRST(reg))));
+			FAIL_IF(push_inst(compiler, ins_right | T(REG_PAIR_FIRST(reg)) | IMM_RIGHT(memw), DR(REG_PAIR_FIRST(reg))));
+			FAIL_IF(push_inst(compiler, ins | T(REG_PAIR_SECOND(reg)) | IMM_LEFT(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg))));
+			return push_inst(compiler, ins_right | T(REG_PAIR_SECOND(reg)) | IMM_RIGHT(memw + SSIZE_OF(sw)), DR(REG_PAIR_SECOND(reg)));
 		}
 #endif /* !(SLJIT_MIPS_REV >= 6) */
 
@@ -3854,8 +4025,8 @@
 
 		if (type & SLJIT_MEM_STORE) {
 			FAIL_IF(push_inst(compiler, SRA_W | T(reg) | D(TMP_REG2) | SH_IMM(8), DR(TMP_REG2)));
-			FAIL_IF(push_inst(compiler, data_transfer_insts[BYTE_DATA] | S(mem) | T(TMP_REG2) | MEM16_IMM_FIRST(memw), MOVABLE_INS));
-			return push_inst(compiler, data_transfer_insts[BYTE_DATA] | S(mem) | T(reg) | MEM16_IMM_SECOND(memw), MOVABLE_INS);
+			FAIL_IF(push_inst(compiler, data_transfer_insts[BYTE_DATA] | S(mem) | T(TMP_REG2) | IMM_16_FIRST(memw), MOVABLE_INS));
+			return push_inst(compiler, data_transfer_insts[BYTE_DATA] | S(mem) | T(reg) | IMM_16_SECOND(memw), MOVABLE_INS);
 		}
 
 		flags = BYTE_DATA | LOAD_DATA;
@@ -3863,8 +4034,8 @@
 		if (op == SLJIT_MOV_S16)
 			flags |= SIGNED_DATA;
 
-		FAIL_IF(push_inst(compiler, data_transfer_insts[flags] | S(mem) | T(TMP_REG2) | MEM16_IMM_FIRST(memw), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, data_transfer_insts[BYTE_DATA | LOAD_DATA] | S(mem) | T(reg) | MEM16_IMM_SECOND(memw), DR(reg)));
+		FAIL_IF(push_inst(compiler, data_transfer_insts[flags] | S(mem) | T(TMP_REG2) | IMM_16_FIRST(memw), DR(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, data_transfer_insts[BYTE_DATA | LOAD_DATA] | S(mem) | T(reg) | IMM_16_SECOND(memw), DR(reg)));
 		FAIL_IF(push_inst(compiler, SLL_W | T(TMP_REG2) | D(TMP_REG2) | SH_IMM(8), DR(TMP_REG2)));
 		return push_inst(compiler, OR | S(reg) | T(TMP_REG2) | D(reg), DR(reg));
 
@@ -3883,8 +4054,8 @@
 		SLJIT_ASSERT(FAST_IS_REG(mem) && mem != TMP_REG2);
 
 		if (type & SLJIT_MEM_STORE) {
-			FAIL_IF(push_inst(compiler, SDL | S(mem) | T(reg) | IMM(memw), MOVABLE_INS));
-			return push_inst(compiler, SDR | S(mem) | T(reg) | IMM(memw + 7), MOVABLE_INS);
+			FAIL_IF(push_inst(compiler, SDL | S(mem) | T(reg) | IMM_LEFT(memw), MOVABLE_INS));
+			return push_inst(compiler, SDR | S(mem) | T(reg) | IMM_RIGHT(memw), MOVABLE_INS);
 		}
 
 		if (mem == reg) {
@@ -3892,8 +4063,8 @@
 			mem = TMP_REG1;
 		}
 
-		FAIL_IF(push_inst(compiler, LDL | S(mem) | T(reg) | IMM(memw), DR(reg)));
-		return push_inst(compiler, LDR | S(mem) | T(reg) | IMM(memw + 7), DR(reg));
+		FAIL_IF(push_inst(compiler, LDL | S(mem) | T(reg) | IMM_LEFT(memw), DR(reg)));
+		return push_inst(compiler, LDR | S(mem) | T(reg) | IMM_RIGHT(memw), DR(reg));
 #endif /* SLJIT_CONFIG_MIPS_32 */
 	}
 
@@ -3901,8 +4072,8 @@
 	SLJIT_ASSERT(FAST_IS_REG(mem) && mem != TMP_REG2);
 
 	if (type & SLJIT_MEM_STORE) {
-		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(reg) | IMM(memw), MOVABLE_INS));
-		return push_inst(compiler, SWR | S(mem) | T(reg) | IMM(memw + 3), MOVABLE_INS);
+		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(reg) | IMM_32_LEFT(memw), MOVABLE_INS));
+		return push_inst(compiler, SWR | S(mem) | T(reg) | IMM_32_RIGHT(memw), MOVABLE_INS);
 	}
 
 	if (mem == reg) {
@@ -3910,18 +4081,18 @@
 		mem = TMP_REG1;
 	}
 
-	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(reg) | IMM(memw), DR(reg)));
+	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(reg) | IMM_32_LEFT(memw), DR(reg)));
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-	return push_inst(compiler, LWR | S(mem) | T(reg) | IMM(memw + 3), DR(reg));
+	return push_inst(compiler, LWR | S(mem) | T(reg) | IMM_32_RIGHT(memw), DR(reg));
 #else /* !SLJIT_CONFIG_MIPS_32 */
-	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(reg) | IMM(memw + 3), DR(reg)));
+	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(reg) | IMM_32_RIGHT(memw), DR(reg)));
 
 	if (op != SLJIT_MOV_U32)
 		return SLJIT_SUCCESS;
 
 #if (defined SLJIT_MIPS_REV && SLJIT_MIPS_REV >= 2)
-	return push_inst(compiler, DINSU | T(reg) | SA(0) | (31 << 11) | (0 << 11), DR(reg));
-#else  /* SLJIT_MIPS_REV < 1 */
+	return push_inst(compiler, DINSU | T(reg) | SA(0) | (31 << 11), DR(reg));
+#else  /* SLJIT_MIPS_REV < 2 */
 	FAIL_IF(push_inst(compiler, DSLL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg)));
 	return push_inst(compiler, DSRL32 | T(reg) | D(reg) | SH_IMM(0), DR(reg));
 #endif /* SLJIT_MIPS_REV >= 2 */
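
Both branches zero-extend the just-loaded 32-bit value to 64 bits: DINSU (rev 2 and later) inserts zeros into bits 32-63 directly, with the hunk dropping the redundant `(0 << 11)` term and fixing the `#else` comment, while the fallback shifts up and back down. In portable C the result of either path is:

    #include <stdint.h>

    /* What DINSU and the DSLL32/DSRL32 pair both compute. */
    static uint64_t zext32(uint64_t r)
    {
        return (r << 32) >> 32;   /* equivalent to r & 0xffffffff */
    }
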
@@ -3944,77 +4115,97 @@
 	if (type & SLJIT_MEM_STORE) {
 		if (type & SLJIT_32) {
 			FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | FS(freg), DR(TMP_REG2)));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+#if !defined(SLJIT_MIPS_REV) || (SLJIT_CONFIG_MIPS_32 && SLJIT_MIPS_REV <= 1)
 			FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif
-			FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM(memw), MOVABLE_INS));
-			return push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), MOVABLE_INS);
+#endif /* MIPS III */
+			FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM_32_LEFT(memw), MOVABLE_INS));
+			return push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM_32_RIGHT(memw), MOVABLE_INS);
 		}
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-		FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | MEMF64_FS_FIRST(freg), DR(TMP_REG2)));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | FS(freg), DR(TMP_REG2)));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
 		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif /* MIPS III */
+		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM_F64_FIRST_LEFT(memw), MOVABLE_INS));
+		FAIL_IF(push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM_F64_FIRST_RIGHT(memw), MOVABLE_INS));
+		switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+		case CPU_FEATURE_FR:
+			FAIL_IF(push_inst(compiler, MFHC1 | T(TMP_REG2) | FS(freg), DR(TMP_REG2)));
+			break;
+#endif /* SLJIT_MIPS_REV >= 2 */
+		default:
+			FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | FS(freg) | (1 << 11), DR(TMP_REG2)));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+			FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
 #endif /* MIPS III */
-		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM(memw), MOVABLE_INS));
-		FAIL_IF(push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), MOVABLE_INS));
+			break;
+		}
 
-		FAIL_IF(push_inst(compiler, MFC1 | T(TMP_REG2) | MEMF64_FS_SECOND(freg), DR(TMP_REG2)));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
-		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif
-		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM(memw + 4), MOVABLE_INS));
-		return push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM(memw + 7), MOVABLE_INS);
+		FAIL_IF(push_inst(compiler, SWL | S(mem) | T(TMP_REG2) | IMM_F64_SECOND_LEFT(memw), MOVABLE_INS));
+		return push_inst(compiler, SWR | S(mem) | T(TMP_REG2) | IMM_F64_SECOND_RIGHT(memw), MOVABLE_INS);
 #else /* !SLJIT_CONFIG_MIPS_32 */
-		FAIL_IF(push_inst(compiler, MFC1 | (1 << 21) | T(TMP_REG2) | FS(freg), DR(TMP_REG2)));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+		FAIL_IF(push_inst(compiler, DMFC1 | T(TMP_REG2) | FS(freg), DR(TMP_REG2)));
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
 		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif
-		FAIL_IF(push_inst(compiler, SDL | S(mem) | T(TMP_REG2) | IMM(memw), MOVABLE_INS));
-		return push_inst(compiler, SDR | S(mem) | T(TMP_REG2) | IMM(memw + 7), MOVABLE_INS);
+#endif /* MIPS III */
+		FAIL_IF(push_inst(compiler, SDL | S(mem) | T(TMP_REG2) | IMM_LEFT(memw), MOVABLE_INS));
+		return push_inst(compiler, SDR | S(mem) | T(TMP_REG2) | IMM_RIGHT(memw), MOVABLE_INS);
 #endif /* SLJIT_CONFIG_MIPS_32 */
 	}
 
 	if (type & SLJIT_32) {
-		FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM(memw), DR(TMP_REG2)));
-		FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), DR(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM_32_LEFT(memw), DR(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM_32_RIGHT(memw), DR(TMP_REG2)));
 
 		FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | FS(freg), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
+#if !defined(SLJIT_MIPS_REV) || (SLJIT_CONFIG_MIPS_32 && SLJIT_MIPS_REV <= 1)
 		FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif
+#endif /* MIPS III */
 		return SLJIT_SUCCESS;
 	}
 
 #if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM(memw), DR(TMP_REG2)));
-	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM(memw + 3), DR(TMP_REG2)));
-	FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | MEMF64_FS_FIRST(freg), MOVABLE_INS));
+	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM_F64_FIRST_LEFT(memw), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM_F64_FIRST_RIGHT(memw), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | FS(freg), MOVABLE_INS));
 
-	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM(memw + 4), DR(TMP_REG2)));
-	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM(memw + 7), DR(TMP_REG2)));
-	FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | MEMF64_FS_SECOND(freg), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
-	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif
+	FAIL_IF(push_inst(compiler, LWL | S(mem) | T(TMP_REG2) | IMM_F64_SECOND_LEFT(memw), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, LWR | S(mem) | T(TMP_REG2) | IMM_F64_SECOND_RIGHT(memw), DR(TMP_REG2)));
+	switch (cpu_feature_list & CPU_FEATURE_FR) {
+#if defined(SLJIT_MIPS_REV) && SLJIT_MIPS_REV >= 2
+	case CPU_FEATURE_FR:
+		return push_inst(compiler, MTHC1 | T(TMP_REG2) | FS(freg), MOVABLE_INS);
+#endif /* SLJIT_MIPS_REV >= 2 */
+	default:
+		FAIL_IF(push_inst(compiler, MTC1 | T(TMP_REG2) | FS(freg) | (1 << 11), MOVABLE_INS));
+		break;
+	}
 #else /* !SLJIT_CONFIG_MIPS_32 */
-	FAIL_IF(push_inst(compiler, LDL | S(mem) | T(TMP_REG2) | IMM(memw), DR(TMP_REG2)));
-	FAIL_IF(push_inst(compiler, LDR | S(mem) | T(TMP_REG2) | IMM(memw + 7), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, LDL | S(mem) | T(TMP_REG2) | IMM_LEFT(memw), DR(TMP_REG2)));
+	FAIL_IF(push_inst(compiler, LDR | S(mem) | T(TMP_REG2) | IMM_RIGHT(memw), DR(TMP_REG2)));
 
-	FAIL_IF(push_inst(compiler, MTC1 | (1 << 21) | T(TMP_REG2) | FS(freg), MOVABLE_INS));
-#if (!defined SLJIT_MIPS_REV || SLJIT_MIPS_REV <= 3)
-	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
-#endif
+	FAIL_IF(push_inst(compiler, DMTC1 | T(TMP_REG2) | FS(freg), MOVABLE_INS));
 #endif /* SLJIT_CONFIG_MIPS_32 */
+#if !defined(SLJIT_MIPS_REV) || SLJIT_MIPS_REV <= 1
+	FAIL_IF(push_inst(compiler, NOP, UNMOVABLE_INS));
+#endif /* MIPS III */
 	return SLJIT_SUCCESS;
 }
 
 #endif /* !SLJIT_MIPS_REV || SLJIT_MIPS_REV < 6 */
 
-#undef MEM16_IMM_FIRST
-#undef MEM16_IMM_SECOND
-#undef MEMF64_FS_FIRST
-#undef MEMF64_FS_SECOND
+#undef IMM_16_SECOND
+#undef IMM_16_FIRST
+#undef IMM_F64_SECOND_RIGHT
+#undef IMM_F64_SECOND_LEFT
+#undef IMM_F64_FIRST_RIGHT
+#undef IMM_F64_FIRST_LEFT
+#undef IMM_32_RIGHT
+#undef IMM_32_LEFT
+#undef IMM_RIGHT
+#undef IMM_LEFT
 #undef MEM_CHECK_UNALIGNED
 
 #undef TO_ARGW_HI
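
The larger rewrite in sljit_emit_fmem above distinguishes the two MIPS32 FPU register modes: with a 64-bit register file (FR=1, tracked by CPU_FEATURE_FR) the upper word of a double is reached with MFHC1/MTHC1, which exist from rev 2 on; with paired 32-bit registers (FR=0) it lives in the odd register of the even/odd pair, selected by the `1 << 11` added to the FS field. A compilable summary of the dispatch (flag value and strings invented for illustration):

    #include <stdio.h>

    #define CPU_FEATURE_FR 0x1   /* assumed flag value */

    /* Which coprocessor-1 move reaches the high word of double f<n>? */
    static const char *high_word_access(unsigned features)
    {
        if (features & CPU_FEATURE_FR)
            return "MFHC1 f<n>";   /* 64-bit FPU registers, rev >= 2 */
        return "MFC1 f<n+1>";      /* odd half of the even/odd pair */
    }

    int main(void)
    {
        printf("FR=1: %s\nFR=0: %s\n",
               high_word_access(CPU_FEATURE_FR), high_word_access(0));
        return 0;
    }
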
diff --git a/src/sljit/sljitNativePPC_32.c b/src/sljit/sljitNativePPC_32.c
index a335c24..2352fad 100644
--- a/src/sljit/sljitNativePPC_32.c
+++ b/src/sljit/sljitNativePPC_32.c
@@ -332,7 +332,7 @@
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 	sljit_s32 invert_sign = 1;
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw ^ (sljit_sw)0x80000000));
 		src = TMP_REG1;
 		invert_sign = 0;
@@ -351,10 +351,10 @@
 	if (invert_sign)
 		FAIL_IF(push_inst(compiler, XORIS | S(src) | A(TMP_REG1) | 0x8000));
 	FAIL_IF(push_inst(compiler, STW | S(TMP_REG2) | A(SLJIT_SP) | TMP_MEM_OFFSET_HI));
-	FAIL_IF(push_inst(compiler, STW | S(TMP_REG1) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+	FAIL_IF(push_inst(compiler, STW | S(TMP_REG1) | A(SLJIT_SP) | TMP_MEM_OFFSET_LO));
 	FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG1) | A(0) | 0x8000));
 	FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG1) | A(SLJIT_SP) | TMP_MEM_OFFSET));
-	FAIL_IF(push_inst(compiler, STW | S(TMP_REG1) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+	FAIL_IF(push_inst(compiler, STW | S(TMP_REG1) | A(SLJIT_SP) | TMP_MEM_OFFSET_LO));
 	FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG2) | A(SLJIT_SP) | TMP_MEM_OFFSET));
 
 	FAIL_IF(push_inst(compiler, FSUB | FD(dst_r) | FA(TMP_FREG1) | FB(TMP_FREG2)));
@@ -373,7 +373,7 @@
 {
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 		src = TMP_REG1;
 	} else if (!FAST_IS_REG(src)) {
@@ -387,11 +387,11 @@
 	   is simply the value of the source argument. Finally we subtract 2^53
 	   to get the converted value. */
 	FAIL_IF(push_inst(compiler, ADDIS | D(TMP_REG2) | A(0) | 0x4330));
-	FAIL_IF(push_inst(compiler, STW | S(src) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+	FAIL_IF(push_inst(compiler, STW | S(src) | A(SLJIT_SP) | TMP_MEM_OFFSET_LO));
 	FAIL_IF(push_inst(compiler, STW | S(TMP_REG2) | A(SLJIT_SP) | TMP_MEM_OFFSET_HI));
 
 	FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG1) | A(SLJIT_SP) | TMP_MEM_OFFSET));
-	FAIL_IF(push_inst(compiler, STW | S(TMP_ZERO) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+	FAIL_IF(push_inst(compiler, STW | S(TMP_ZERO) | A(SLJIT_SP) | TMP_MEM_OFFSET_LO));
 	FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG2) | A(SLJIT_SP) | TMP_MEM_OFFSET));
 
 	FAIL_IF(push_inst(compiler, FSUB | FD(dst_r) | FA(TMP_FREG1) | FB(TMP_FREG2)));
@@ -455,9 +455,9 @@
 		FAIL_IF(push_inst(compiler, STW | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET_HI));
 
 		if (reg2 != 0)
-			FAIL_IF(push_inst(compiler, STW | S(reg2) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+			FAIL_IF(push_inst(compiler, STW | S(reg2) | A(SLJIT_SP) | TMP_MEM_OFFSET_LO));
 		else
-			FAIL_IF(push_inst(compiler, STFD | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+			FAIL_IF(push_inst(compiler, STFD | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET_LO));
 
 		return push_inst(compiler, LFD | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET);
 	}
@@ -465,7 +465,7 @@
 	FAIL_IF(push_inst(compiler, STFD | FS(freg) | A(SLJIT_SP) | TMP_MEM_OFFSET));
 
 	if (reg2 != 0)
-		FAIL_IF(push_inst(compiler, LWZ | S(reg2) | A(SLJIT_SP) | TMP_MEM_OFFSET_LOW));
+		FAIL_IF(push_inst(compiler, LWZ | S(reg2) | A(SLJIT_SP) | TMP_MEM_OFFSET_LO));
 
 	return push_inst(compiler, LWZ | S(reg) | A(SLJIT_SP) | TMP_MEM_OFFSET_HI);
 }
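
Beyond the TMP_MEM_OFFSET_LOW to TMP_MEM_OFFSET_LO rename, these hunks sit in the classic PPC32 integer-to-double conversion: store 0x43300000 as the high word and the sign-flipped integer as the low word, reload the pair as a double equal to 2^52 + (x + 2^31), and subtract the constant 2^52 + 2^31. A portable restatement of the signed case (a sketch assuming IEEE-754 doubles):

    #include <stdint.h>
    #include <string.h>

    static double s32_to_f64(int32_t x)
    {
        /* High word 0x43300000 encodes 2^52; the low word holds the
           sign-flipped integer, i.e. x + 2^31 as an unsigned value. */
        uint64_t bits = 0x4330000000000000ULL | ((uint32_t)x ^ 0x80000000u);
        double d;
        memcpy(&d, &bits, sizeof(d));
        return d - 4503601774854144.0;   /* 2^52 + 2^31 */
    }
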
diff --git a/src/sljit/sljitNativePPC_64.c b/src/sljit/sljitNativePPC_64.c
index ff0de04..b3cf9d0 100644
--- a/src/sljit/sljitNativePPC_64.c
+++ b/src/sljit/sljitNativePPC_64.c
@@ -574,7 +574,7 @@
 {
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 			srcw = (sljit_s32)srcw;
 
@@ -611,7 +611,7 @@
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_U32) {
-		if (src & SLJIT_IMM) {
+		if (src == SLJIT_IMM) {
 			FAIL_IF(load_immediate(compiler, TMP_REG1, (sljit_u32)srcw));
 			src = TMP_REG1;
 		} else {
@@ -626,7 +626,7 @@
 		FAIL_IF(push_inst(compiler, LFD | FS(TMP_FREG1) | A(SLJIT_SP) | TMP_MEM_OFFSET));
 		FAIL_IF(push_inst(compiler, FCFID | FD(dst_r) | FB(TMP_FREG1)));
 	} else {
-		if (src & SLJIT_IMM) {
+		if (src == SLJIT_IMM) {
 			FAIL_IF(load_immediate(compiler, TMP_REG1, srcw));
 			src = TMP_REG1;
 		} else if (src & SLJIT_MEM) {
diff --git a/src/sljit/sljitNativePPC_common.c b/src/sljit/sljitNativePPC_common.c
index fc2389c..54977f0 100644
--- a/src/sljit/sljitNativePPC_common.c
+++ b/src/sljit/sljitNativePPC_common.c
@@ -132,7 +132,7 @@
    OE and Rc flag (see ALT_SET_FLAGS). */
 #define OE(flags)	((flags) & ALT_SET_FLAGS)
 /* Rc flag (see ALT_SET_FLAGS). */
-#define RC(flags)	(((flags) & ALT_SET_FLAGS) >> 10)
+#define RC(flags)	((sljit_ins)((flags) & ALT_SET_FLAGS) >> 10)
 #define HI(opcode)	((sljit_ins)(opcode) << 26)
 #define LO(opcode)	((sljit_ins)(opcode) << 1)
 
@@ -150,6 +150,9 @@
 #define BCx		(HI(16))
 #define BCCTR		(HI(19) | LO(528) | (3 << 11))
 #define BLR		(HI(19) | LO(16) | (0x14 << 21))
+#if defined(_ARCH_PWR10) && _ARCH_PWR10
+#define BRD		(HI(31) | LO(187))
+#endif /* POWER10 */
 #define CNTLZD		(HI(31) | LO(58))
 #define CNTLZW		(HI(31) | LO(26))
 #define CMP		(HI(31) | LO(0))
@@ -184,6 +187,9 @@
 #define LD		(HI(58) | 0)
 #define LFD		(HI(50))
 #define LFS		(HI(48))
+#if defined(_ARCH_PWR7) && _ARCH_PWR7
+#define LDBRX		(HI(31) | LO(532))
+#endif /* POWER7 */
 #define LHBRX		(HI(31) | LO(790))
 #define LWBRX		(HI(31) | LO(534))
 #define LWZ		(HI(32))
@@ -222,6 +228,9 @@
 #define SRD		(HI(31) | LO(539))
 #define SRW		(HI(31) | LO(536))
 #define STD		(HI(62) | 0)
+#if defined(_ARCH_PWR7) && _ARCH_PWR7
+#define STDBRX		(HI(31) | LO(660))
+#endif /* POWER7 */
 #define STDU		(HI(62) | 1)
 #define STDUX		(HI(31) | LO(181))
 #define STFD		(HI(54))
@@ -266,11 +275,15 @@
 #endif /* SLJIT_CONFIG_PPC_32 */
 
 #if (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
-#define TMP_MEM_OFFSET_LOW TMP_MEM_OFFSET
-#define TMP_MEM_OFFSET_HI (TMP_MEM_OFFSET + sizeof(sljit_s32))
+#define TMP_MEM_OFFSET_LO	(TMP_MEM_OFFSET)
+#define TMP_MEM_OFFSET_HI	(TMP_MEM_OFFSET + sizeof(sljit_s32))
+#define LWBRX_FIRST_REG		S(TMP_REG1)
+#define LWBRX_SECOND_REG	S(dst)
 #else /* !SLJIT_LITTLE_ENDIAN */
-#define TMP_MEM_OFFSET_LOW (TMP_MEM_OFFSET + sizeof(sljit_s32))
-#define TMP_MEM_OFFSET_HI TMP_MEM_OFFSET
+#define TMP_MEM_OFFSET_LO	(TMP_MEM_OFFSET + sizeof(sljit_s32))
+#define TMP_MEM_OFFSET_HI	(TMP_MEM_OFFSET)
+#define LWBRX_FIRST_REG		S(dst)
+#define LWBRX_SECOND_REG	S(TMP_REG1)
 #endif /* SLJIT_LITTLE_ENDIAN */
 
 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
@@ -439,6 +452,7 @@
 	reverse_buf(compiler);
 
 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
+	/* Add extra instruction space to compiler->size for the trampoline and its padding. */
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	compiler->size += (compiler->size & 0x1) + (sizeof(struct sljit_function_context) / sizeof(sljit_ins));
 #else
@@ -639,7 +653,6 @@
 
 	compiler->error = SLJIT_ERR_COMPILED;
 	compiler->executable_offset = executable_offset;
-	compiler->executable_size = (sljit_uw)(code_ptr - code) * sizeof(sljit_ins);
 
 	code = (sljit_ins *)SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
 
@@ -657,8 +670,12 @@
 	SLJIT_UPDATE_WX_FLAGS(code, code_ptr, 1);
 
 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
+	compiler->executable_size = (sljit_uw)(code_ptr - code) * sizeof(sljit_ins) + sizeof(struct sljit_function_context);
+
 	return code_ptr;
 #else
+	compiler->executable_size = (sljit_uw)(code_ptr - code) * sizeof(sljit_ins);
+
 	return code;
 #endif
 }
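
Moving the executable_size assignment below the SLJIT_INDIRECT_CALL branch lets it include the sljit_function_context appended for indirect-call ABIs, so the reported size matches what was actually written (the s390x part of this patch does the same for its constant pool). A usage fragment (error handling elided; the bookkeeping hook is hypothetical):

    void *code = sljit_generate_code(compiler);
    sljit_uw total = sljit_get_generated_code_size(compiler);
    record_jit_usage(code, total);   /* hypothetical accounting hook */
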
@@ -668,12 +685,17 @@
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #else
 		/* Available by default. */
 		return 1;
 #endif
-
+	case SLJIT_HAS_REV:
+#if defined(_ARCH_PWR10) && _ARCH_PWR10
+		return 1;
+#else /* !POWER10 */
+		return 2;
+#endif /* POWER10 */
 	/* A saved register is set to a zero value. */
 	case SLJIT_HAS_ZERO_REGISTER:
 	case SLJIT_HAS_CLZ:
@@ -682,7 +704,6 @@
 		return 1;
 
 	case SLJIT_HAS_CTZ:
-	case SLJIT_HAS_REV:
 		return 2;
 
 	default:
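
SLJIT_HAS_REV leaves the always-emulated group: POWER10's BRD instruction makes byte reversal fully native (return value 1), while older cores keep the emulated status (2). Reading the tri-state result in a client, under the 0/1/2 convention inferred from these returns (helpers invented):

    int rev = sljit_has_cpu_feature(SLJIT_HAS_REV);
    if (rev == 1)
        emit_byteswap_path(compiler);      /* hypothetical: native REV */
    else if (rev == 2)
        emit_shift_mask_path(compiler);    /* hypothetical: slower fallback */
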
@@ -1251,7 +1272,7 @@
 		src1_r = src1;
 		flags |= REG1_SOURCE;
 	}
-	else if (src1 & SLJIT_IMM) {
+	else if (src1 == SLJIT_IMM) {
 		src1_r = TMP_ZERO;
 		if (src1w != 0) {
 			FAIL_IF(load_immediate(compiler, TMP_REG1, src1w));
@@ -1271,7 +1292,7 @@
 		if (!(flags & REG_DEST) && op >= SLJIT_MOV && op <= SLJIT_MOV_P)
 			dst_r = src2_r;
 	}
-	else if (src2 & SLJIT_IMM) {
+	else if (src2 == SLJIT_IMM) {
 		src2_r = TMP_ZERO;
 		if (src2w != 0) {
 			FAIL_IF(load_immediate(compiler, sugg_src2_r, src2w));
@@ -1371,12 +1392,16 @@
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 		if (!is_32) {
+#if defined(_ARCH_PWR10) && _ARCH_PWR10
+			return push_inst(compiler, BRD | S(src) | A(dst));
+#else /* !POWER10 */
 			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(0) | IMM(TMP_MEM_OFFSET_HI)));
 			FAIL_IF(push_inst(compiler, RLDICL | S(src) | A(TMP_REG1) | RLDI_SH(32) | RLDI_MB(32)));
 			FAIL_IF(push_inst(compiler, STWBRX | S(src) | A(SLJIT_SP) | B(TMP_REG2)));
-			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(0) | IMM(TMP_MEM_OFFSET_LOW)));
+			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(0) | IMM(TMP_MEM_OFFSET_LO)));
 			FAIL_IF(push_inst(compiler, STWBRX | S(TMP_REG1) | A(SLJIT_SP) | B(TMP_REG2)));
 			return push_inst(compiler, LD | D(dst) | A(SLJIT_SP) | TMP_MEM_OFFSET);
+#endif /* POWER10 */
 		}
 #endif /* SLJIT_CONFIG_PPC_64 */
 
@@ -1452,16 +1477,30 @@
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 	if (!is_32) {
 		if (dst & SLJIT_MEM) {
+#if defined(_ARCH_PWR7) && _ARCH_PWR7
+			return push_inst(compiler, STDBRX | S(src) | A(mem) | B(offs_reg));
+#else /* !POWER7 */
+#if defined(SLJIT_LITTLE_ENDIAN) && SLJIT_LITTLE_ENDIAN
+			FAIL_IF(push_inst(compiler, RLDICL | S(src) | A(TMP_REG1) | RLDI_SH(32) | RLDI_MB(32)));
+			FAIL_IF(push_inst(compiler, STWBRX | S(TMP_REG1) | A(mem) | B(offs_reg)));
+			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(offs_reg) | IMM(SSIZE_OF(s32))));
+			return push_inst(compiler, STWBRX | S(src) | A(mem) | B(TMP_REG2));
+#else /* !SLJIT_LITTLE_ENDIAN */
 			FAIL_IF(push_inst(compiler, STWBRX | S(src) | A(mem) | B(offs_reg)));
 			FAIL_IF(push_inst(compiler, RLDICL | S(src) | A(TMP_REG1) | RLDI_SH(32) | RLDI_MB(32)));
 			FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(offs_reg) | IMM(SSIZE_OF(s32))));
 			return push_inst(compiler, STWBRX | S(TMP_REG1) | A(mem) | B(TMP_REG2));
+#endif /* SLJIT_LITTLE_ENDIAN */
+#endif /* POWER7 */
 		}
-
-		FAIL_IF(push_inst(compiler, LWBRX | S(dst) | A(mem) | B(offs_reg)));
+#if defined(_ARCH_PWR7) && _ARCH_PWR7
+		return push_inst(compiler, LDBRX | S(dst) | A(mem) | B(offs_reg));
+#else /* !POWER7 */
+		FAIL_IF(push_inst(compiler, LWBRX | LWBRX_FIRST_REG | A(mem) | B(offs_reg)));
 		FAIL_IF(push_inst(compiler, ADDI | D(TMP_REG2) | A(offs_reg) | IMM(SSIZE_OF(s32))));
-		FAIL_IF(push_inst(compiler, LWBRX | S(TMP_REG1) | A(mem) | B(TMP_REG2)));
+		FAIL_IF(push_inst(compiler, LWBRX | LWBRX_SECOND_REG | A(mem) | B(TMP_REG2)));
 		return push_inst(compiler, RLDIMI | S(TMP_REG1) | A(dst) | RLDI_SH(32) | RLDI_MB(0));
+#endif /* POWER7 */
 	}
 #endif /* SLJIT_CONFIG_PPC_64 */
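
Pre-POWER7 cores have no 64-bit LDBRX/STDBRX, so an 8-byte reverse is composed from two 32-bit LWBRX swaps; which destination register receives which half depends on byte order, hence the new LWBRX_FIRST_REG/LWBRX_SECOND_REG macros, and RLDIMI merges the halves. The arithmetic in plain C:

    #include <stdint.h>

    static uint32_t bswap32(uint32_t v)
    {
        return (v >> 24) | ((v >> 8) & 0x0000ff00u)
             | ((v << 8) & 0x00ff0000u) | (v << 24);
    }

    /* Swap each half (the two LWBRX loads), then exchange the halves
       (the RLDIMI rotate-and-insert). */
    static uint64_t bswap64(uint64_t v)
    {
        return ((uint64_t)bswap32((uint32_t)v) << 32)
             | bswap32((uint32_t)(v >> 32));
    }
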
 
@@ -1477,7 +1516,7 @@
 }
 
 #define EMIT_MOV(type, type_flags, type_cast) \
-	emit_op(compiler, (src & SLJIT_IMM) ? SLJIT_MOV : type, flags | (type_flags), dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? type_cast srcw : srcw)
+	emit_op(compiler, (src == SLJIT_IMM) ? SLJIT_MOV : type, flags | (type_flags), dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? type_cast srcw : srcw)
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
@@ -1508,7 +1547,7 @@
 				if (op == SLJIT_MOV_S32)
 					op = SLJIT_MOV_U32;
 			}
-			else if (src & SLJIT_IMM) {
+			else if (src == SLJIT_IMM) {
 				if (op == SLJIT_MOV_U32)
 					op = SLJIT_MOV_S32;
 			}
@@ -1580,40 +1619,22 @@
 
 #undef EMIT_MOV
 
+/* Macros for checking different operand types / values. */
 #define TEST_SL_IMM(src, srcw) \
-	(((src) & SLJIT_IMM) && (srcw) <= SIMM_MAX && (srcw) >= SIMM_MIN)
-
+	((src) == SLJIT_IMM && (srcw) <= SIMM_MAX && (srcw) >= SIMM_MIN)
 #define TEST_UL_IMM(src, srcw) \
-	(((src) & SLJIT_IMM) && !((srcw) & ~0xffff))
-
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
-#define TEST_SH_IMM(src, srcw) \
-	(((src) & SLJIT_IMM) && !((srcw) & 0xffff) && (srcw) <= 0x7fffffffl && (srcw) >= -0x80000000l)
-#else
-#define TEST_SH_IMM(src, srcw) \
-	(((src) & SLJIT_IMM) && !((srcw) & 0xffff))
-#endif
-
+	((src) == SLJIT_IMM && !((srcw) & ~0xffff))
 #define TEST_UH_IMM(src, srcw) \
-	(((src) & SLJIT_IMM) && !((srcw) & ~(sljit_sw)0xffff0000))
+	((src) == SLJIT_IMM && !((srcw) & ~(sljit_sw)0xffff0000))
 
 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+#define TEST_SH_IMM(src, srcw) \
+	((src) == SLJIT_IMM && !((srcw) & 0xffff) && (srcw) <= 0x7fffffffl && (srcw) >= -0x80000000l)
 #define TEST_ADD_IMM(src, srcw) \
-	(((src) & SLJIT_IMM) && (srcw) <= 0x7fff7fffl && (srcw) >= -0x80000000l)
-#else
-#define TEST_ADD_IMM(src, srcw) \
-	((src) & SLJIT_IMM)
-#endif
-
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
+	((src) == SLJIT_IMM && (srcw) <= 0x7fff7fffl && (srcw) >= -0x80000000l)
 #define TEST_UI_IMM(src, srcw) \
-	(((src) & SLJIT_IMM) && !((srcw) & ~0xffffffff))
-#else
-#define TEST_UI_IMM(src, srcw) \
-	((src) & SLJIT_IMM)
-#endif
+	((src) == SLJIT_IMM && !((srcw) & ~0xffffffff))
 
-#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
 #define TEST_ADD_FORM1(op) \
 	(GET_FLAG_TYPE(op) == SLJIT_OVERFLOW \
 		|| (op & (SLJIT_32 | SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == (SLJIT_32 | SLJIT_SET_Z | SLJIT_SET_CARRY))
@@ -1623,14 +1644,22 @@
 #define TEST_SUB_FORM3(op) \
 	(GET_FLAG_TYPE(op) == SLJIT_OVERFLOW \
 		|| (op & (SLJIT_32 | SLJIT_SET_Z)) == (SLJIT_32 | SLJIT_SET_Z))
-#else
+
+#else /* !SLJIT_CONFIG_PPC_64 */
+#define TEST_SH_IMM(src, srcw) \
+	((src) == SLJIT_IMM && !((srcw) & 0xffff))
+#define TEST_ADD_IMM(src, srcw) \
+	((src) == SLJIT_IMM)
+#define TEST_UI_IMM(src, srcw) \
+	((src) == SLJIT_IMM)
+
 #define TEST_ADD_FORM1(op) \
 	(GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
 #define TEST_SUB_FORM2(op) \
 	(GET_FLAG_TYPE(op) >= SLJIT_SIG_LESS && GET_FLAG_TYPE(op) <= SLJIT_SIG_LESS_EQUAL)
 #define TEST_SUB_FORM3(op) \
 	(GET_FLAG_TYPE(op) == SLJIT_OVERFLOW)
-#endif
+#endif /* SLJIT_CONFIG_PPC_64 */
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
 	sljit_s32 dst, sljit_sw dstw,
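
The TEST_* reorganization only regroups the 64-bit and 32-bit definitions under a single conditional each; the predicates are unchanged. They map constants onto PowerPC's 16-bit immediate instruction fields, restated standalone here (names invented, bounds copied from the macros):

    #include <stdint.h>

    static int fits_simm16(int64_t w)       /* addi-style signed field */
    { return w >= -0x8000 && w <= 0x7fff; }

    static int fits_uimm16(int64_t w)       /* ori/andi-style unsigned field */
    { return (w & ~(int64_t)0xffff) == 0; }

    static int fits_shifted16(int64_t w)    /* addis-style: low half clear */
    { return (w & 0xffff) == 0 && w >= -0x80000000ll && w <= 0x7fffffffll; }
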
@@ -1649,9 +1678,9 @@
 	if (op & SLJIT_32) {
 		/* Most operations expect sign extended arguments. */
 		flags |= INT_DATA | SIGNED_DATA;
-		if (src1 & SLJIT_IMM)
+		if (src1 == SLJIT_IMM)
 			src1w = (sljit_s32)(src1w);
-		if (src2 & SLJIT_IMM)
+		if (src2 == SLJIT_IMM)
 			src2w = (sljit_s32)(src2w);
 		if (HAS_FLAGS(op))
 			flags |= ALT_SIGN_EXT;
@@ -1667,7 +1696,7 @@
 		if (TEST_ADD_FORM1(op))
 			return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM1, dst, dstw, src1, src1w, src2, src2w);
 
-		if (!HAS_FLAGS(op) && ((src1 | src2) & SLJIT_IMM)) {
+		if (!HAS_FLAGS(op) && (src1 == SLJIT_IMM || src2 == SLJIT_IMM)) {
 			if (TEST_SL_IMM(src2, src2w)) {
 				compiler->imm = (sljit_ins)src2w & 0xffff;
 				return emit_op(compiler, SLJIT_ADD, flags | ALT_FORM2, dst, dstw, src1, src1w, TMP_REG2, 0);
@@ -1736,7 +1765,7 @@
 				return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM1, dst, dstw, src1, src1w, src2, src2w);
 			}
 
-			if ((src2 & SLJIT_IMM) && src2w >= 0 && src2w <= (SIMM_MAX + 1)) {
+			if (src2 == SLJIT_IMM && src2w >= 0 && src2w <= (SIMM_MAX + 1)) {
 				compiler->imm = (sljit_ins)src2w;
 				return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM1 | ALT_FORM2 | ALT_FORM3, dst, dstw, src1, src1w, TMP_REG2, 0);
 			}
@@ -1752,7 +1781,7 @@
 		}
 
 		if (TEST_SUB_FORM2(op)) {
-			if ((src2 & SLJIT_IMM) && src2w >= -SIMM_MAX && src2w <= SIMM_MAX) {
+			if (src2 == SLJIT_IMM && src2w >= -SIMM_MAX && src2w <= SIMM_MAX) {
 				compiler->imm = (sljit_ins)src2w & 0xffff;
 				return emit_op(compiler, SLJIT_SUB, flags | ALT_FORM2 | ALT_FORM3 | ALT_FORM4, dst, dstw, src1, src1w, TMP_REG2, 0);
 			}
@@ -1811,10 +1840,10 @@
 		return emit_op(compiler, SLJIT_MUL, flags, dst, dstw, src1, src1w, src2, src2w);
 
 	case SLJIT_XOR:
-		if ((src2 & SLJIT_IMM) && src2w == -1) {
+		if (src2 == SLJIT_IMM && src2w == -1) {
 			return emit_op(compiler, GET_OPCODE(op), flags | ALT_FORM4, dst, dstw, TMP_REG1, 0, src1, src1w);
 		}
-		if ((src1 & SLJIT_IMM) && src1w == -1) {
+		if (src1 == SLJIT_IMM && src1w == -1) {
 			return emit_op(compiler, GET_OPCODE(op), flags | ALT_FORM4, dst, dstw, TMP_REG1, 0, src2, src2w);
 		}
 		/* fallthrough */
@@ -1864,7 +1893,7 @@
 		if (op & SLJIT_32)
 			flags |= ALT_FORM2;
 #endif
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			compiler->imm = (sljit_ins)src2w;
 			return emit_op(compiler, GET_OPCODE(op), flags | ALT_FORM1, dst, dstw, src1, src1w, TMP_REG2, 0);
 		}
@@ -1916,7 +1945,7 @@
 
 	ADJUST_LOCAL_OFFSET(src3, src3w);
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		src3w &= bit_length - 1;
 
 		if (src3w == 0)
@@ -2059,7 +2088,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return reg_map[reg];
 
 	if (type != SLJIT_FLOAT_REGISTER)
@@ -2071,6 +2100,8 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
 	void *instruction, sljit_u32 size)
 {
+	SLJIT_UNUSED_ARG(size);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
 
@@ -2440,6 +2471,8 @@
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_call(struct sljit_compiler *compiler, sljit_s32 type,
 	sljit_s32 arg_types)
 {
+	SLJIT_UNUSED_ARG(arg_types);
+
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_emit_call(compiler, type, arg_types));
 
@@ -2476,7 +2509,7 @@
 #else /* SLJIT_PASS_ENTRY_ADDR_TO_CALL */
 		src_r = src;
 #endif /* SLJIT_PASS_ENTRY_ADDR_TO_CALL */
-	} else if (src & SLJIT_IMM) {
+	} else if (src == SLJIT_IMM) {
 		/* These jumps are converted to jump/call instructions when possible. */
 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
 		FAIL_IF(!jump);
@@ -2506,6 +2539,8 @@
 	sljit_s32 arg_types,
 	sljit_s32 src, sljit_sw srcw)
 {
+	SLJIT_UNUSED_ARG(arg_types);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
 
@@ -2737,7 +2772,7 @@
 
 	if (src1 & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem(compiler, inp_flags, dst_reg, src1, src1w, TMP_REG1));
-	} else if (src1 & SLJIT_IMM) {
+	} else if (src1 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
 		if (type & SLJIT_32)
 			src1w = (sljit_s32)src1w;
diff --git a/src/sljit/sljitNativeRISCV_common.c b/src/sljit/sljitNativeRISCV_common.c
index f05ebbf..3b54ab9 100644
--- a/src/sljit/sljitNativeRISCV_common.c
+++ b/src/sljit/sljitNativeRISCV_common.c
@@ -536,7 +536,7 @@
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #elif defined(__riscv_float_abi_soft)
 		return 0;
 #else
@@ -1068,7 +1068,7 @@
 {
 	sljit_s32 is_clz = (GET_OPCODE(op) == SLJIT_CLZ);
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
-	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_ins word = (sljit_ins)(op & SLJIT_32) >> 5;
 	sljit_ins word_size = (op & SLJIT_32) ? 32 : 64;
 #else /* !SLJIT_CONFIG_RISCV_64 */
 	sljit_ins word_size = 32;
@@ -1154,7 +1154,7 @@
 static sljit_s32 emit_rev16(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst, sljit_sw src)
 {
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
-	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_ins word = (sljit_ins)(op & SLJIT_32) >> 5;
 	sljit_ins word_size = (op & SLJIT_32) ? 32 : 64;
 #else /* !SLJIT_CONFIG_RISCV_64 */
 	sljit_ins word_size = 32;
@@ -1191,7 +1191,7 @@
 	sljit_s32 is_overflow, is_carry, carry_src_r, is_handled;
 	sljit_ins op_imm, op_reg;
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
-	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_ins word = (sljit_ins)(op & SLJIT_32) >> 5;
 #endif /* SLJIT_CONFIG_RISCV_64 */
 
 	SLJIT_ASSERT(WORD == 0 || WORD == 0x8);
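
The recurring RISC-V change here casts `op & SLJIT_32` to sljit_ins before the shift, keeping the arithmetic unsigned. The derived value is the W-variant opcode bit: SLJIT_32 is 0x100 in sljit.h, so shifting right by five yields 0x8, exactly what the assertion checks. A minimal verification:

    #include <stdio.h>

    #define TOY_SLJIT_32 0x100   /* the SLJIT_32 flag value in sljit.h */

    int main(void)
    {
        unsigned op = TOY_SLJIT_32;               /* a 32-bit-variant op */
        unsigned word = (op & TOY_SLJIT_32) >> 5; /* 0x8: ADD -> ADDW bit */
        printf("word = 0x%x\n", word);
        return 0;
    }
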
@@ -1653,11 +1653,11 @@
 		flags |= SLOW_DEST;
 
 	if (flags & IMM_OP) {
-		if ((src2 & SLJIT_IMM) && src2w != 0 && src2w <= SIMM_MAX && src2w >= SIMM_MIN) {
+		if (src2 == SLJIT_IMM && src2w != 0 && src2w <= SIMM_MAX && src2w >= SIMM_MIN) {
 			flags |= SRC2_IMM;
 			src2_r = src2w;
 		}
-		else if ((flags & CUMULATIVE_OP) && (src1 & SLJIT_IMM) && src1w != 0 && src1w <= SIMM_MAX && src1w >= SIMM_MIN) {
+		else if ((flags & CUMULATIVE_OP) && src1 == SLJIT_IMM && src1w != 0 && src1w <= SIMM_MAX && src1w >= SIMM_MIN) {
 			flags |= SRC2_IMM;
 			src2_r = src1w;
 
@@ -1674,7 +1674,7 @@
 		src1_r = src1;
 		flags |= REG1_SOURCE;
 	}
-	else if (src1 & SLJIT_IMM) {
+	else if (src1 == SLJIT_IMM) {
 		if (src1w) {
 			FAIL_IF(load_immediate(compiler, TMP_REG1, src1w, TMP_REG3));
 			src1_r = TMP_REG1;
@@ -1697,7 +1697,7 @@
 		if ((flags & (REG_DEST | MOVE_OP)) == MOVE_OP)
 			dst_r = (sljit_s32)src2_r;
 	}
-	else if (src2 & SLJIT_IMM) {
+	else if (src2 == SLJIT_IMM) {
 		if (!(flags & SRC2_IMM)) {
 			if (src2w) {
 				FAIL_IF(load_immediate(compiler, sugg_src2_r, src2w, TMP_REG3));
@@ -1754,7 +1754,7 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
 {
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
-	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_ins word = (sljit_ins)(op & SLJIT_32) >> 5;
 
 	SLJIT_ASSERT(word == 0 || word == 0x8);
 #endif /* SLJIT_CONFIG_RISCV_64 */
@@ -1823,25 +1823,25 @@
 
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
 	case SLJIT_MOV_U32:
-		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u32)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_U32, INT_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u32)srcw : srcw);
 
 	case SLJIT_MOV_S32:
 	/* Logical operators have no W variant, so sign extended input is necessary for them. */
 	case SLJIT_MOV32:
-		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s32)srcw : srcw);
+		return emit_op(compiler, SLJIT_MOV_S32, INT_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s32)srcw : srcw);
 #endif
 
 	case SLJIT_MOV_U8:
-		return emit_op(compiler, op, BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u8)srcw : srcw);
+		return emit_op(compiler, op, BYTE_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u8)srcw : srcw);
 
 	case SLJIT_MOV_S8:
-		return emit_op(compiler, op, BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s8)srcw : srcw);
+		return emit_op(compiler, op, BYTE_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s8)srcw : srcw);
 
 	case SLJIT_MOV_U16:
-		return emit_op(compiler, op, HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_u16)srcw : srcw);
+		return emit_op(compiler, op, HALF_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_u16)srcw : srcw);
 
 	case SLJIT_MOV_S16:
-		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src & SLJIT_IMM) ? (sljit_s16)srcw : srcw);
+		return emit_op(compiler, op, HALF_DATA | SIGNED_DATA | MOVE_OP, dst, dstw, TMP_REG1, 0, src, (src == SLJIT_IMM) ? (sljit_s16)srcw : srcw);
 
 	case SLJIT_CLZ:
 	case SLJIT_CTZ:
@@ -1877,9 +1877,9 @@
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
 	if (op & SLJIT_32) {
 		flags |= INT_DATA | SIGNED_DATA;
-		if (src1 & SLJIT_IMM)
+		if (src1 == SLJIT_IMM)
 			src1w = (sljit_s32)src1w;
-		if (src2 & SLJIT_IMM)
+		if (src2 == SLJIT_IMM)
 			src2w = (sljit_s32)src2w;
 	}
 #endif
@@ -1912,7 +1912,7 @@
 	case SLJIT_MASHR:
 	case SLJIT_ROTL:
 	case SLJIT_ROTR:
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_RISCV_32 && SLJIT_CONFIG_RISCV_32)
 			src2w &= 0x1f;
 #else /* !SLJIT_CONFIG_RISCV_32 */
@@ -1950,7 +1950,7 @@
 	sljit_s32 is_left;
 	sljit_ins ins1, ins2, ins3;
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
-	sljit_ins word = (op & SLJIT_32) >> 5;
+	sljit_ins word = (sljit_ins)(op & SLJIT_32) >> 5;
 	sljit_s32 inp_flags = ((op & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
 	sljit_sw bit_length = (op & SLJIT_32) ? 32 : 64;
 #else /* !SLJIT_CONFIG_RISCV_64 */
@@ -1972,7 +1972,7 @@
 
 	ADJUST_LOCAL_OFFSET(src3, src3w);
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		src3w &= bit_length - 1;
 
 		if (src3w == 0)
@@ -2083,7 +2083,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return reg_map[reg];
 
 	if (type != SLJIT_FLOAT_REGISTER)
@@ -2095,6 +2095,8 @@
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
 	void *instruction, sljit_u32 size)
 {
+	SLJIT_UNUSED_ARG(size);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
 
@@ -2154,7 +2156,7 @@
 		FAIL_IF(emit_op_mem2(compiler, ((ins & (1 << 21)) ? WORD_DATA : INT_DATA) | LOAD_DATA, TMP_REG1, src, srcw, dst, dstw));
 #endif /* !SLJIT_CONFIG_RISCV_32 */
 		src = TMP_REG1;
-	} else if (src & SLJIT_IMM) {
+	} else if (src == SLJIT_IMM) {
 		FAIL_IF(load_immediate(compiler, TMP_REG1, srcw, TMP_REG3));
 		src = TMP_REG1;
 	}
@@ -2178,7 +2180,7 @@
 #else /* !SLJIT_CONFIG_RISCV_32 */
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
 		ins |= (1 << 21);
-	else if (src & SLJIT_IMM)
+	else if (src == SLJIT_IMM)
 		srcw = (sljit_s32)srcw;
 
 	if (op != SLJIT_CONV_F64_FROM_S32)
@@ -2200,7 +2202,7 @@
 #else /* !SLJIT_CONFIG_RISCV_32 */
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_UW)
 		ins |= (1 << 21);
-	else if (src & SLJIT_IMM)
+	else if (src == SLJIT_IMM)
 		srcw = (sljit_u32)srcw;
 
 	if (op != SLJIT_CONV_F64_FROM_S32)
@@ -2575,7 +2577,7 @@
 		src2 = TMP_REG2;
 	}
 
-	if (src1 & SLJIT_IMM) {
+	if (src1 == SLJIT_IMM) {
 		if (src1w != 0) {
 			PTR_FAIL_IF(load_immediate(compiler, TMP_REG1, src1w, TMP_REG3));
 			src1 = TMP_REG1;
@@ -2584,7 +2586,7 @@
 			src1 = TMP_ZERO;
 	}
 
-	if (src2 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
 		if (src2w != 0) {
 			PTR_FAIL_IF(load_immediate(compiler, TMP_REG2, src2w, TMP_REG3));
 			src2 = TMP_REG2;
@@ -2654,7 +2656,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
 
-	if (!(src & SLJIT_IMM)) {
+	if (src != SLJIT_IMM) {
 		if (src & SLJIT_MEM) {
 			ADJUST_LOCAL_OFFSET(src, srcw);
 			FAIL_IF(emit_op_mem(compiler, WORD_DATA | LOAD_DATA, TMP_REG1, src, srcw));
@@ -2804,7 +2806,7 @@
 	sljit_ins *ptr;
 	sljit_uw size;
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
-	sljit_ins word = (type & SLJIT_32) >> 5;
+	sljit_ins word = (sljit_ins)(type & SLJIT_32) >> 5;
 	sljit_s32 inp_flags = ((type & SLJIT_32) ? INT_DATA : WORD_DATA) | LOAD_DATA;
 #else /* !SLJIT_CONFIG_RISCV_64 */
 	sljit_s32 inp_flags = WORD_DATA | LOAD_DATA;
@@ -2845,7 +2847,7 @@
 
 	if (src1 & SLJIT_MEM) {
 		FAIL_IF(emit_op_mem(compiler, inp_flags, dst_reg, src1, src1w));
-	} else if (src1 & SLJIT_IMM) {
+	} else if (src1 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_RISCV_64 && SLJIT_CONFIG_RISCV_64)
 		if (word)
 			src1w = (sljit_s32)src1w;
diff --git a/src/sljit/sljitNativeS390X.c b/src/sljit/sljitNativeS390X.c
index 5b5e7c6..97521b5 100644
--- a/src/sljit/sljitNativeS390X.c
+++ b/src/sljit/sljitNativeS390X.c
@@ -451,10 +451,12 @@
 
 static SLJIT_INLINE sljit_ins disp_s20(sljit_s32 d)
 {
+	sljit_uw dh, dl;
+
 	SLJIT_ASSERT(is_s20(d));
 
-	sljit_uw dh = (d >> 12) & 0xff;
-	sljit_uw dl = (d << 8) & 0xfff00;
+	dh = (d >> 12) & 0xff;
+	dl = ((sljit_uw)d << 8) & 0xfff00;
 	return (dh | dl) << 8;
 }
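
Besides hoisting the declarations for C89 compilers, the hunk widens `d` to unsigned before the left shift, since left-shifting a negative signed value is undefined behaviour in C. The helper splits a signed 20-bit displacement into the s390x DL (low 12 bits) and DH (high 8 bits) fields; a standalone copy for reference:

    #include <stdint.h>

    static uint32_t encode_disp_s20(int32_t d)   /* caller ensures s20 range */
    {
        uint32_t dh = ((uint32_t)d >> 12) & 0xff;     /* bits 12..19 -> DH */
        uint32_t dl = ((uint32_t)d << 8) & 0xfff00;   /* bits 0..11  -> DL */
        return (dh | dl) << 8;
    }
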
 
@@ -1065,15 +1067,17 @@
 	sljit_gpr dst_r,
 	sljit_s32 src, sljit_sw srcw)
 {
+	sljit_gpr src_r;
+
 	SLJIT_ASSERT(!IS_GPR_REG(src) || dst_r != gpr(src & REG_MASK));
 
-	if (src & SLJIT_IMM)
+	if (src == SLJIT_IMM)
 		return push_load_imm_inst(compiler, dst_r, srcw);
 
 	if (src & SLJIT_MEM)
 		return load_word(compiler, dst_r, src, srcw, (compiler->mode & SLJIT_32) != 0);
 
-	sljit_gpr src_r = gpr(src & REG_MASK);
+	src_r = gpr(src & REG_MASK);
 	return push_inst(compiler, (compiler->mode & SLJIT_32) ? lr(dst_r, src_r) : lgr(dst_r, src_r));
 }
 
@@ -1266,10 +1270,10 @@
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_sw srcw)
 {
-	SLJIT_ASSERT(dst & SLJIT_MEM);
-
 	sljit_gpr dst_r = tmp1;
 
+	SLJIT_ASSERT(dst & SLJIT_MEM);
+
 	if (dst & OFFS_REG_MASK) {
 		sljit_gpr index = tmp1;
 
@@ -1574,6 +1578,8 @@
 			if (jump && jump->addr == j) {
 				sljit_sw target = (sljit_sw)((jump->flags & JUMP_LABEL) ? jump->u.label->addr : jump->u.target);
 				if ((jump->flags & SLJIT_REWRITABLE_JUMP) || (jump->flags & JUMP_ADDR)) {
+					sljit_ins op, arg;
+
 					jump->addr = (sljit_uw)pool_ptr;
 
 					/* load address into tmp1 */
@@ -1590,8 +1596,8 @@
 					*(pool_ptr++) = (sljit_uw)target;
 
 					/* branch to tmp1 */
-					sljit_ins op = (ins >> 32) & 0xf;
-					sljit_ins arg = (ins >> 36) & 0xf;
+					op = (ins >> 32) & 0xf;
+					arg = (ins >> 36) & 0xf;
 					switch (op) {
 					case 4: /* brcl -> bcr */
 						ins = bcr(arg, tmp1);
@@ -1645,6 +1651,8 @@
 	compiler->error = SLJIT_ERR_COMPILED;
 	compiler->executable_offset = executable_offset;
 	compiler->executable_size = ins_size;
+	if (pool_size)
+		compiler->executable_size += (pad_size + pool_size);
 	code = SLJIT_ADD_EXEC_OFFSET(code, executable_offset);
 	code_ptr = SLJIT_ADD_EXEC_OFFSET(code_ptr, executable_offset);
 	SLJIT_CACHE_FLUSH(code, code_ptr);
@@ -1657,15 +1665,24 @@
 	/* TODO(mundaym): implement all */
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
+#ifdef SLJIT_IS_FPU_AVAILABLE
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
+#else
+		return 1;
+#endif /* SLJIT_IS_FPU_AVAILABLE */
+
 	case SLJIT_HAS_CLZ:
 	case SLJIT_HAS_REV:
 	case SLJIT_HAS_ROT:
 	case SLJIT_HAS_PREFETCH:
 	case SLJIT_HAS_COPY_F32:
 	case SLJIT_HAS_COPY_F64:
+	case SLJIT_HAS_ATOMIC:
 		return 1;
+
 	case SLJIT_HAS_CTZ:
 		return 2;
+
 	case SLJIT_HAS_CMOV:
 		return have_lscond1() ? 1 : 0;
 	}
@@ -2176,7 +2193,7 @@
 			return SLJIT_SUCCESS;
 		}
 		/* LOAD IMMEDIATE */
-		if (FAST_IS_REG(dst) && (src & SLJIT_IMM)) {
+		if (FAST_IS_REG(dst) && src == SLJIT_IMM) {
 			switch (opcode) {
 			case SLJIT_MOV_U8:
 				srcw = (sljit_sw)((sljit_u8)(srcw));
@@ -2255,14 +2272,14 @@
 			return SLJIT_SUCCESS;
 		}
 		/* STORE and STORE IMMEDIATE */
-		if ((dst & SLJIT_MEM)
-			&& (FAST_IS_REG(src) || (src & SLJIT_IMM))) {
+		if ((dst & SLJIT_MEM) && (FAST_IS_REG(src) || src == SLJIT_IMM)) {
+			struct addr mem;
 			sljit_gpr reg = FAST_IS_REG(src) ? gpr(src) : tmp0;
-			if (src & SLJIT_IMM) {
+
+			if (src == SLJIT_IMM) {
 				/* TODO(mundaym): MOVE IMMEDIATE? */
 				FAIL_IF(push_load_imm_inst(compiler, reg, srcw));
 			}
-			struct addr mem;
 			FAIL_IF(make_addr_bxy(compiler, &mem, dst, dstw, tmp1));
 			switch (opcode) {
 			case SLJIT_MOV_U8:
@@ -2329,7 +2346,7 @@
 		SLJIT_UNREACHABLE();
 	}
 
-	SLJIT_ASSERT((src & SLJIT_IMM) == 0); /* no immediates */
+	SLJIT_ASSERT(src != SLJIT_IMM);
 
 	dst_r = FAST_IS_REG(dst) ? gpr(dst) : tmp0;
 	src_r = FAST_IS_REG(src) ? gpr(src) : tmp0;
@@ -2407,7 +2424,7 @@
 	const struct ins_forms *forms;
 	sljit_ins ins;
 
-	if (src2 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
 		if (!sets_zero_overflow && is_s8(src2w) && (src1 & SLJIT_MEM) && (dst == src1 && dstw == src1w)) {
 			if (sets_overflow)
 				ins = (op & SLJIT_32) ? 0xeb000000006a /* asi */ : 0xeb000000007a /* agsi */;
@@ -2492,9 +2509,8 @@
 
 		compiler->status_flags_state |= SLJIT_CURRENT_FLAGS_COMPARE;
 
-		if (src2 & SLJIT_IMM) {
-			if (compare_signed || ((op & VARIABLE_FLAG_MASK) == 0 && is_s32(src2w)))
-			{
+		if (src2 == SLJIT_IMM) {
+			if (compare_signed || ((op & VARIABLE_FLAG_MASK) == 0 && is_s32(src2w))) {
 				if ((op & SLJIT_32) || is_s32(src2w)) {
 					ins = (op & SLJIT_32) ? 0xc20d00000000 /* cfi */ : 0xc20c00000000 /* cgfi */;
 					return emit_ri(compiler, ins, src1, src1, src1w, src2w, RIL_A);
@@ -2535,7 +2551,7 @@
 		goto done;
 	}
 
-	if (src2 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
 		sljit_sw neg_src2w = -src2w;
 
 		if (sets_signed || neg_src2w != 0 || (op & (SLJIT_SET_Z | VARIABLE_FLAG_MASK)) == 0) {
@@ -2643,7 +2659,7 @@
 		return emit_commutative(compiler, &multiply_overflow_forms, dst, src1, src1w, src2, src2w);
 	}
 
-	if (src2 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
 		if (is_s16(src2w)) {
 			ins = (op & SLJIT_32) ? 0xa70c0000 /* mhi */ : 0xa70d0000 /* mghi */;
 			return emit_ri(compiler, ins, dst, src1, src1w, src2w, RI_A);
@@ -2750,7 +2766,7 @@
 	sljit_s32 type = GET_OPCODE(op);
 	const struct ins_forms *forms;
 
-	if ((src2 & SLJIT_IMM) && (!(op & SLJIT_SET_Z) || (type == SLJIT_AND && dst == (sljit_s32)tmp0))) {
+	if (src2 == SLJIT_IMM && (!(op & SLJIT_SET_Z) || (type == SLJIT_AND && dst == (sljit_s32)tmp0))) {
 		sljit_s32 count16 = 0;
 		sljit_uw imm = (sljit_uw)src2w;
 
@@ -2814,7 +2830,7 @@
 	else
 		FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
 
-	if (!(src2 & SLJIT_IMM)) {
+	if (src2 != SLJIT_IMM) {
 		if (FAST_IS_REG(src2))
 			base_r = gpr(src2);
 		else {
@@ -2874,7 +2890,7 @@
 	else
 		FAIL_IF(emit_move(compiler, tmp0, src1, src1w));
 
-	if (!(src2 & SLJIT_IMM)) {
+	if (src2 != SLJIT_IMM) {
 		if (FAST_IS_REG(src2))
 			base_r = gpr(src2);
 		else {
@@ -2884,7 +2900,7 @@
 	}
 
 	if (GET_OPCODE(op) == SLJIT_ROTR) {
-		if (!(src2 & SLJIT_IMM)) {
+		if (src2 != SLJIT_IMM) {
 			ins = (op & SLJIT_32) ? 0x1300 /* lcr */ : 0xb9030000 /* lcgr */;
 			FAIL_IF(push_inst(compiler, ins | R4A(tmp1) | R0A(base_r)));
 			base_r = tmp1;
@@ -2892,7 +2908,7 @@
 			src2w = -src2w;
 	}
 
-	if (src2 & SLJIT_IMM)
+	if (src2 == SLJIT_IMM)
 		imm = (sljit_ins)(src2w & ((op & SLJIT_32) ? 0x1f : 0x3f));
 
 	ins = (op & SLJIT_32) ? 0xeb000000001d /* rll */ : 0xeb000000001c /* rllg */;
@@ -2933,7 +2949,7 @@
 	compiler->mode = op & SLJIT_32;
 	compiler->status_flags_state = op & (VARIABLE_FLAG_MASK | SLJIT_SET_Z);
 
-	if (is_commutative(op) && (src1 & SLJIT_IMM) && !(src2 & SLJIT_IMM)) {
+	if (is_commutative(op) && src1 == SLJIT_IMM && src2 != SLJIT_IMM) {
 		src1 ^= src2;
 		src2 ^= src1;
 		src1 ^= src2;
@@ -3026,7 +3042,7 @@
 
 	ADJUST_LOCAL_OFFSET(src3, src3w);
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		src3w &= bit_length - 1;
 
 		if (src3w == 0)
@@ -3183,7 +3199,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER)
+	if (type == SLJIT_GP_REGISTER)
 		return (sljit_s32)gpr(reg);
 
 	if (type != SLJIT_FLOAT_REGISTER)
@@ -3286,7 +3302,7 @@
 {
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG1;
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		FAIL_IF(push_load_imm_inst(compiler, tmp0, srcw));
 		src = (sljit_s32)tmp0;
 	}
@@ -3309,7 +3325,7 @@
 {
 	sljit_ins ins;
 
-	if ((src & SLJIT_IMM) && GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
+	if (src == SLJIT_IMM && GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 		srcw = (sljit_s32)srcw;
 
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
@@ -3326,7 +3342,7 @@
 {
 	sljit_ins ins;
 
-	if ((src & SLJIT_IMM) && GET_OPCODE(op) == SLJIT_CONV_F64_FROM_U32)
+	if (src == SLJIT_IMM && GET_OPCODE(op) == SLJIT_CONV_F64_FROM_U32)
 		srcw = (sljit_u32)srcw;
 
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_UW)
@@ -3525,7 +3541,7 @@
 
 	u.value = value;
 
-	FAIL_IF(push_load_imm_inst(compiler, tmp1, (sljit_sw)u.imm << 32));
+	FAIL_IF(push_load_imm_inst(compiler, tmp1, (sljit_sw)((sljit_uw)u.imm << 32)));
 	return push_inst(compiler, 0xb3c10000 /* ldgr */ | F4(freg) | R0A(tmp1));
 }
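
Same undefined-behaviour fix as in disp_s20(): u.imm carries the raw bits of an f32 value, and LDGR copies a whole GPR into an FPR, where a short float occupies the high 32 bits, hence the shift by 32, now performed on an unsigned operand. In isolation:

    #include <stdint.h>

    static int64_t f32_bits_for_ldgr(uint32_t imm)
    {
        /* The float's bits must land in bits 32..63 of the 64-bit GPR. */
        return (int64_t)((uint64_t)imm << 32);
    }
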
 
@@ -3595,14 +3611,14 @@
 
 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
 {
+	struct sljit_jump *jump;
 	sljit_u8 mask = ((type & 0xff) < SLJIT_JUMP) ? get_cc(compiler, type & 0xff) : 0xf;
 
 	CHECK_ERROR_PTR();
 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
 
 	/* record jump */
-	struct sljit_jump *jump = (struct sljit_jump *)
-		ensure_abuf(compiler, sizeof(struct sljit_jump));
+	jump = (struct sljit_jump *)ensure_abuf(compiler, sizeof(struct sljit_jump));
 	PTR_FAIL_IF(!jump);
 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
 	jump->addr = compiler->size;
@@ -3640,7 +3656,7 @@
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		SLJIT_ASSERT(!(srcw & 1)); /* target address must be even */
 		FAIL_IF(push_load_imm_inst(compiler, src_r, srcw));
 	}
@@ -3660,6 +3676,8 @@
 	sljit_s32 arg_types,
 	sljit_s32 src, sljit_sw srcw)
 {
+	SLJIT_UNUSED_ARG(arg_types);
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_icall(compiler, type, arg_types, src, srcw));
 
@@ -3691,13 +3709,13 @@
 	sljit_s32 dst, sljit_sw dstw,
 	sljit_s32 type)
 {
+	sljit_gpr dst_r = FAST_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
+	sljit_gpr loc_r = tmp1;
 	sljit_u8 mask = get_cc(compiler, type);
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
 
-	sljit_gpr dst_r = FAST_IS_REG(dst) ? gpr(dst & REG_MASK) : tmp0;
-	sljit_gpr loc_r = tmp1;
 	switch (GET_OPCODE(op)) {
 	case SLJIT_AND:
 	case SLJIT_OR:
@@ -3817,7 +3835,7 @@
 		return push_inst(compiler, ins | R36A(dst_r) | (mask << 32) | R28A(src_r) | disp_s20((sljit_s32)src1w));
 	}
 
-	if (src1 & SLJIT_IMM) {
+	if (src1 == SLJIT_IMM) {
 		if (type & SLJIT_32)
 			src1w = (sljit_s32)src1w;
 
diff --git a/src/sljit/sljitNativeX86_32.c b/src/sljit/sljitNativeX86_32.c
index ed69e6b..1bba883 100644
--- a/src/sljit/sljitNativeX86_32.c
+++ b/src/sljit/sljitNativeX86_32.c
@@ -62,21 +62,19 @@
 	/* Both size flags cannot be switched on. */
 	SLJIT_ASSERT((flags & (EX86_BYTE_ARG | EX86_HALF_ARG)) != (EX86_BYTE_ARG | EX86_HALF_ARG));
 	/* SSE2 and immediate is not possible. */
-	SLJIT_ASSERT(!(a & SLJIT_IMM) || !(flags & EX86_SSE2));
-	SLJIT_ASSERT((flags & (EX86_PREF_F2 | EX86_PREF_F3)) != (EX86_PREF_F2 | EX86_PREF_F3)
-		&& (flags & (EX86_PREF_F2 | EX86_PREF_66)) != (EX86_PREF_F2 | EX86_PREF_66)
-		&& (flags & (EX86_PREF_F3 | EX86_PREF_66)) != (EX86_PREF_F3 | EX86_PREF_66));
+	SLJIT_ASSERT(a != SLJIT_IMM || !(flags & EX86_SSE2));
+	SLJIT_ASSERT(((flags & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66))
+			& ((flags & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66)) - 1)) == 0);
+	SLJIT_ASSERT((flags & (EX86_VEX_EXT | EX86_REX)) != EX86_VEX_EXT);
 
 	size &= 0xf;
-	inst_size = size;
+	/* The mod r/m byte is always present. */
+	inst_size = size + 1;
 
-	if (flags & (EX86_PREF_F2 | EX86_PREF_F3))
-		inst_size++;
-	if (flags & EX86_PREF_66)
+	if (flags & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66))
 		inst_size++;
 
 	/* Calculate size of b. */
-	inst_size += 1; /* mod r/m byte. */
 	if (b & SLJIT_MEM) {
 		if (!(b & REG_MASK))
 			inst_size += sizeof(sljit_sw);
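
The three pairwise assertions collapse into one "at most one prefix bit set" check built on the standard x & (x - 1) idiom, and the always-emitted mod r/m byte moves into the initial inst_size. The idiom on its own:

    /* Clearing the lowest set bit leaves zero iff at most one bit was
       set - i.e. at most one of the F2/F3/66 prefixes is requested. */
    static int at_most_one_bit_set(unsigned x)
    {
        return (x & (x - 1)) == 0;
    }
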
@@ -87,8 +85,7 @@
 					inst_size += sizeof(sljit_s8);
 				else
 					inst_size += sizeof(sljit_sw);
-			}
-			else if (reg_map[b & REG_MASK] == 5) {
+			} else if (reg_map[b & REG_MASK] == 5) {
 				/* Swap registers if possible. */
 				if ((b & OFFS_REG_MASK) && (immb & 0x3) == 0 && reg_map[OFFS_REG(b)] != 5)
 					b = SLJIT_MEM | OFFS_REG(b) | TO_OFFS_REG(b & REG_MASK);
@@ -105,15 +102,14 @@
 	}
 
 	/* Calculate size of a. */
-	if (a & SLJIT_IMM) {
+	if (a == SLJIT_IMM) {
 		if (flags & EX86_BIN_INS) {
 			if (imma <= 127 && imma >= -128) {
 				inst_size += 1;
 				flags |= EX86_BYTE_ARG;
 			} else
 				inst_size += 4;
-		}
-		else if (flags & EX86_SHIFT_INS) {
+		} else if (flags & EX86_SHIFT_INS) {
 			SLJIT_ASSERT(imma <= 0x1f);
 			if (imma != 1) {
 				inst_size++;
@@ -125,8 +121,7 @@
 			inst_size += sizeof(short);
 		else
 			inst_size += sizeof(sljit_sw);
-	}
-	else
+	} else
 		SLJIT_ASSERT(!(flags & EX86_SHIFT_INS) || a == SLJIT_PREF_SHIFT_REG);
 
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + inst_size);
@@ -136,27 +131,26 @@
 	INC_SIZE(inst_size);
 	if (flags & EX86_PREF_F2)
 		*inst++ = 0xf2;
-	if (flags & EX86_PREF_F3)
+	else if (flags & EX86_PREF_F3)
 		*inst++ = 0xf3;
-	if (flags & EX86_PREF_66)
+	else if (flags & EX86_PREF_66)
 		*inst++ = 0x66;
 
 	buf_ptr = inst + size;
 
 	/* Encode mod/rm byte. */
 	if (!(flags & EX86_SHIFT_INS)) {
-		if ((flags & EX86_BIN_INS) && (a & SLJIT_IMM))
+		if ((flags & EX86_BIN_INS) && a == SLJIT_IMM)
 			*inst = (flags & EX86_BYTE_ARG) ? GROUP_BINARY_83 : GROUP_BINARY_81;
 
-		if (a & SLJIT_IMM)
+		if (a == SLJIT_IMM)
 			*buf_ptr = 0;
 		else if (!(flags & EX86_SSE2_OP1))
 			*buf_ptr = U8(reg_map[a] << 3);
 		else
 			*buf_ptr = U8(a << 3);
-	}
-	else {
-		if (a & SLJIT_IMM) {
+	} else {
+		if (a == SLJIT_IMM) {
 			if (imma == 1)
 				*inst = GROUP_SHIFT_1;
 			else
@@ -183,8 +177,9 @@
 			if (!(b & OFFS_REG_MASK))
 				*buf_ptr++ |= reg_map_b;
 			else {
-				*buf_ptr++ |= 0x04;
-				*buf_ptr++ = U8(reg_map_b | (reg_map[OFFS_REG(b)] << 3));
+				buf_ptr[0] |= 0x04;
+				buf_ptr[1] = U8(reg_map_b | (reg_map[OFFS_REG(b)] << 3));
+				buf_ptr += 2;
 			}
 
 			if (immb != 0 || reg_map_b == 5) {
@@ -195,25 +190,24 @@
 					buf_ptr += sizeof(sljit_sw);
 				}
 			}
-		}
-		else {
+		} else {
 			if (reg_map_b == 5)
 				*buf_ptr |= 0x40;
 
-			*buf_ptr++ |= 0x04;
-			*buf_ptr++ = U8(reg_map_b | (reg_map[OFFS_REG(b)] << 3) | (immb << 6));
+			buf_ptr[0] |= 0x04;
+			buf_ptr[1] = U8(reg_map_b | (reg_map[OFFS_REG(b)] << 3) | (immb << 6));
+			buf_ptr += 2;
 
 			if (reg_map_b == 5)
 				*buf_ptr++ = 0;
 		}
-	}
-	else {
+	} else {
 		*buf_ptr++ |= 0x05;
 		sljit_unaligned_store_sw(buf_ptr, immb); /* 32 bit displacement. */
 		buf_ptr += sizeof(sljit_sw);
 	}
 
-	if (a & SLJIT_IMM) {
+	if (a == SLJIT_IMM) {
 		if (flags & EX86_BYTE_ARG)
 			*buf_ptr = U8(imma);
 		else if (flags & EX86_HALF_ARG)
@@ -222,7 +216,67 @@
 			sljit_unaligned_store_sw(buf_ptr, imma);
 	}
 
-	return !(flags & EX86_SHIFT_INS) ? inst : (inst + 1);
+	return inst;
+}
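Note the changed return convention above: emit_x86_instruction used to return inst + 1 for EX86_SHIFT_INS so callers could patch the /reg field of the ModRM byte through *inst; it now always returns the pointer to the opcode, and callers index explicitly. This is why the shift/rotate call sites later in this patch change from, for example:

	*inst |= SHR;	/* old: inst already pointed at the ModRM byte */

to:

	inst[1] |= SHR;	/* new: inst points at the opcode, ModRM is inst[1] */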
+
+static sljit_s32 emit_vex_instruction(struct sljit_compiler *compiler, sljit_uw op,
+	/* The first and second register operand. */
+	sljit_s32 a, sljit_s32 v,
+	/* The general operand (not immediate). */
+	sljit_s32 b, sljit_sw immb)
+{
+	sljit_u8 *inst;
+	sljit_u8 vex = 0;
+	sljit_u8 vex_m = 0;
+	sljit_uw size;
+
+	SLJIT_ASSERT(((op & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66))
+			& ((op & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66)) - 1)) == 0);
+
+	if (op & VEX_OP_0F38)
+		vex_m = 0x2;
+	else if (op & VEX_OP_0F3A)
+		vex_m = 0x3;
+
+	if (op & VEX_W) {
+		if (vex_m == 0)
+			vex_m = 0x1;
+
+		vex |= 0x80;
+	}
+
+	if (op & EX86_PREF_66)
+		vex |= 0x1;
+	else if (op & EX86_PREF_F2)
+		vex |= 0x3;
+	else if (op & EX86_PREF_F3)
+		vex |= 0x2;
+
+	op &= ~(EX86_PREF_66 | EX86_PREF_F2 | EX86_PREF_F3);
+
+	if (op & VEX_256)
+		vex |= 0x4;
+
+	vex = U8(vex | ((((op & VEX_SSE2_OPV) ? v : reg_map[v]) ^ 0xf) << 3));
+
+	size = op & ~(sljit_uw)0xff;
+	size |= (vex_m == 0) ? 3 : 4;
+
+	inst = emit_x86_instruction(compiler, size, a, 0, b, immb);
+	FAIL_IF(!inst);
+
+	if (vex_m == 0) {
+		inst[0] = 0xc5;
+		inst[1] = U8(vex | 0x80);
+		inst[2] = U8(op);
+		return SLJIT_SUCCESS;
+	}
+
+	inst[0] = 0xc4;
+	inst[1] = U8(vex_m | 0xe0);
+	inst[2] = vex;
+	inst[3] = U8(op);
+	return SLJIT_SUCCESS;
 }
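For readers unfamiliar with the AVX encoding emitted above: a VEX prefix has a two-byte form (0xc5), usable when only the inverted R bit and the 0x0f opcode map are needed, and a three-byte form (0xc4) that also carries the X/B bits, an opcode-map selector and the W bit. In this function's terms (field layout per the Intel SDM):

	/* two-byte VEX:   c5 | ~R.vvvv.L.pp
	   three-byte VEX: c4 | ~R.~X.~B.mmmmm | W.vvvv.L.pp
	   pp    = vex & 0x3   (01 = 0x66, 10 = 0xf3, 11 = 0xf2 prefix)
	   L     = vex & 0x4   (VEX_256 selects 256-bit vectors)
	   vvvv  = second source register, stored inverted: (v ^ 0xf) << 3
	   mmmmm = vex_m       (1 = 0x0f, 2 = 0x0f38, 3 = 0x0f3a map) */

On 32-bit targets R/X/B are always zero, hence the fixed (vex | 0x80) and (vex_m | 0xe0) above: the inverted bits are all ones. As a worked example, vpxor xmm1, xmm2, xmm3 (VEX.128.66.0F EF /r) encodes as c5 e9 ef cb, where 0xe9 packs ~R:1, vvvv:1101 (~2), L:0, pp:01.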
 
 /* --------------------------------------------------------------------- */
@@ -776,7 +830,7 @@
 
 		offset = stack_size + compiler->local_size;
 
-		if (!(src & SLJIT_IMM) && src != SLJIT_R0) {
+		if (src != SLJIT_IMM && src != SLJIT_R0) {
 			if (word_arg_count >= 1) {
 				EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R0, 0);
 				r2_offset = sizeof(sljit_sw);
@@ -830,7 +884,7 @@
 
 	stack_size = args_size + SSIZE_OF(sw);
 
-	if (word_arg_count >= 1 && !(src & SLJIT_IMM) && src != SLJIT_R0) {
+	if (word_arg_count >= 1 && src != SLJIT_IMM && src != SLJIT_R0) {
 		r2_offset = SSIZE_OF(sw);
 		stack_size += SSIZE_OF(sw);
 	}
@@ -859,7 +913,7 @@
 			EMIT_MOV(compiler, SLJIT_R2, 0, SLJIT_MEM1(SLJIT_SP), word_arg4_offset);
 	}
 
-	if (!(src & SLJIT_IMM) && src != SLJIT_R0) {
+	if (src != SLJIT_IMM && src != SLJIT_R0) {
 		if (word_arg_count >= 1) {
 			SLJIT_ASSERT(r2_offset == sizeof(sljit_sw));
 			EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_R0, 0);
@@ -1063,7 +1117,7 @@
 		stack_size = type;
 		FAIL_IF(tail_call_with_args(compiler, &stack_size, arg_types, src, srcw));
 
-		if (!(src & SLJIT_IMM)) {
+		if (src != SLJIT_IMM) {
 			src = SLJIT_R0;
 			srcw = 0;
 		}
@@ -1291,16 +1345,13 @@
 
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 1, TMP_REG1, 0);
 		FAIL_IF(!inst);
-		*inst |= ROL;
+		inst[1] |= ROL;
 
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 1, TMP_REG1, 0);
 		FAIL_IF(!inst);
-		*inst |= SHR;
+		inst[1] |= SHR;
 
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_F2 | EX86_SSE2_OP1, dst_r, 0, TMP_REG1, 0);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = CVTSI2SD_x_rm;
+		FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_PREF_F2 | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
 
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 		FAIL_IF(!inst);
@@ -1308,7 +1359,7 @@
 		inst[0] = U8(get_jump_code(SLJIT_NOT_CARRY) - 0x10);
 
 		size1 = compiler->size;
-		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, 0, dst_r, SLJIT_MEM0(), (sljit_sw)&f64_high_bit));
+		FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_PREF_F2 | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)&f64_high_bit));
 
 		inst[1] = U8(compiler->size - size1);
 
@@ -1332,10 +1383,7 @@
 
 	size1 = compiler->size;
 
-	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, 0);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = CVTSI2SD_x_rm;
+	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0));
 
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 	FAIL_IF(!inst);
@@ -1352,7 +1400,7 @@
 
 	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 1, TMP_REG1, 0);
 	FAIL_IF(!inst);
-	*inst |= SHR;
+	inst[1] |= SHR;
 
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 	FAIL_IF(!inst);
@@ -1365,12 +1413,8 @@
 	BINARY_IMM32(OR, 1, TMP_REG1, 0);
 	jump_inst1[1] = U8(compiler->size - size1);
 
-	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, TMP_REG1, 0);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = CVTSI2SD_x_rm;
-
-	FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_32, dst_r, dst_r, 0));
+	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
+	FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0));
 
 	jump_inst2[1] = U8(compiler->size - size2);
 
@@ -1430,21 +1474,14 @@
 	u.value = value;
 
 	if (u.imm[0] == 0) {
-		if (u.imm[1] == 0) {
-			inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = PXOR_x_xm;
-			return SLJIT_SUCCESS;
-		}
+		if (u.imm[1] == 0)
+			return emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
 
 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[1]);
 	} else
 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[0]);
 
-	inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, TMP_REG1, 0);
-	inst[0] = GROUP_0F;
-	inst[1] = MOVD_x_rm;
+	FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, TMP_REG1, 0));
 
 	if (u.imm[1] == 0)
 		return SLJIT_SUCCESS;
@@ -1466,20 +1503,12 @@
 
 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[1]);
 
-		if (cpu_feature_list && CPU_FEATURE_SSE41) {
-			inst = emit_x86_instruction(compiler, 3 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, TMP_REG1, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = 0x3a;
-			inst[2] = PINSRD_x_rm_i8;
-
+		if (cpu_feature_list & CPU_FEATURE_SSE41) {
+			FAIL_IF(emit_groupf_ext(compiler, PINSRD_x_rm_i8, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, TMP_REG1, 0));
 			return emit_byte(compiler, 1);
 		}
 
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, 0, TMP_REG1, 0);
-		inst[0] = GROUP_0F;
-		inst[1] = MOVD_x_rm;
-
+		FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, TMP_REG1, 0));
 		tmp_freg = TMP_FREG;
 	}
 
@@ -1509,28 +1538,22 @@
 
 	SLJIT_ASSERT(cpu_feature_list != 0);
 
-	if (!(op & SLJIT_32) && (cpu_feature_list && CPU_FEATURE_SSE41)) {
+	if (!(op & SLJIT_32) && (cpu_feature_list & CPU_FEATURE_SSE41)) {
 		if (reg & REG_PAIR_MASK) {
 			reg2 = REG_PAIR_FIRST(reg);
 			reg = REG_PAIR_SECOND(reg);
 
 			CHECK_EXTRA_REGS(reg, regw, (void)0);
 
-			inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, reg, regw);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x;
+			FAIL_IF(emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x,
+				EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw));
 		} else
 			reg2 = reg;
 
 		CHECK_EXTRA_REGS(reg2, reg2w, (void)0);
 
-		inst = emit_x86_instruction(compiler, 3 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, reg2, reg2w);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = 0x3a;
-		inst[2] = GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? PINSRD_x_rm_i8 : PEXTRD_rm_x_i8;
-
+		FAIL_IF(emit_groupf_ext(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? PINSRD_x_rm_i8 : PEXTRD_rm_x_i8,
+			EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, reg2, reg2w));
 		return emit_byte(compiler, 1);
 	}
 
@@ -1546,13 +1569,9 @@
 
 	CHECK_EXTRA_REGS(reg, regw, (void)0);
 
-	if (op & SLJIT_32) {
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, reg, regw);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x;
-		return SLJIT_SUCCESS;
-	}
+	if (op & SLJIT_32)
+		return emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x,
+			EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw);
 
 	if (op == SLJIT_COPY_FROM_F64) {
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
@@ -1564,18 +1583,12 @@
 		inst[2] = PSHUFD_x_xm;
 		inst[3] = U8(MOD_REG | (TMP_FREG << 3) | freg);
 		inst[4] = 1;
-	} else if (reg != 0) {
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, 0, reg, regw);
-		inst[0] = GROUP_0F;
-		inst[1] = MOVD_x_rm;
-	}
+	} else if (reg != 0)
+		FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw));
 
-	if (reg2 != 0) {
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, reg2, reg2w);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x;
-	}
+	if (reg2 != 0)
+		FAIL_IF(emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x,
+			EX86_PREF_66 | EX86_SSE2_OP1, freg, reg2, reg2w));
 
 	if (GET_OPCODE(op) == SLJIT_COPY_TO_F64) {
 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
@@ -1585,12 +1598,8 @@
 		inst[0] = GROUP_0F;
 		inst[1] = UNPCKLPS_x_xm;
 		inst[2] = U8(MOD_REG | (freg << 3) | (reg == 0 ? freg : TMP_FREG));
-	} else {
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, 0, reg, regw);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = MOVD_rm_x;
-	}
+	} else
+		FAIL_IF(emit_groupf(compiler, MOVD_rm_x, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw));
 
 	return SLJIT_SUCCESS;
 }
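The dominant refactoring in this file (and in sljitNativeX86_64.c below) replaces the hand-rolled three-line pattern for emitting 0x0f-escaped instructions with the emit_groupf helper, plus emit_groupf_ext for the 0x0f 0x38 / 0x0f 0x3a extended maps; both are forward-declared in sljitNativeX86_common.c further down. The definitions are outside this excerpt, but the call sites pin down the shape; a sketch reconstructed from them, not the actual body:

	/* Sketch, inferred from the call sites in this patch. */
	static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
		sljit_u8 opcode, sljit_uw pref,
		sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
	{
		/* Two opcode bytes: the 0x0f escape, then the group-F opcode. */
		sljit_u8 *inst = emit_x86_instruction(compiler, 2 | pref, dst, 0, src, srcw);
		FAIL_IF(!inst);
		inst[0] = GROUP_0F;
		inst[1] = opcode;
		return SLJIT_SUCCESS;
	}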
diff --git a/src/sljit/sljitNativeX86_64.c b/src/sljit/sljitNativeX86_64.c
index d245923..39114c2 100644
--- a/src/sljit/sljitNativeX86_64.c
+++ b/src/sljit/sljitNativeX86_64.c
@@ -72,7 +72,7 @@
 	sljit_uw inst_size;
 
 	/* The immediate operand must be 32 bit. */
-	SLJIT_ASSERT(!(a & SLJIT_IMM) || compiler->mode32 || IS_HALFWORD(imma));
+	SLJIT_ASSERT(a != SLJIT_IMM || compiler->mode32 || IS_HALFWORD(imma));
 	/* Both cannot be switched on. */
 	SLJIT_ASSERT((flags & (EX86_BIN_INS | EX86_SHIFT_INS)) != (EX86_BIN_INS | EX86_SHIFT_INS));
 	/* Size flags not allowed for typed instructions. */
@@ -80,26 +80,24 @@
 	/* Both size flags cannot be switched on. */
 	SLJIT_ASSERT((flags & (EX86_BYTE_ARG | EX86_HALF_ARG)) != (EX86_BYTE_ARG | EX86_HALF_ARG));
 	/* SSE2 and immediate is not possible. */
-	SLJIT_ASSERT(!(a & SLJIT_IMM) || !(flags & EX86_SSE2));
-	SLJIT_ASSERT((flags & (EX86_PREF_F2 | EX86_PREF_F3)) != (EX86_PREF_F2 | EX86_PREF_F3)
-		&& (flags & (EX86_PREF_F2 | EX86_PREF_66)) != (EX86_PREF_F2 | EX86_PREF_66)
-		&& (flags & (EX86_PREF_F3 | EX86_PREF_66)) != (EX86_PREF_F3 | EX86_PREF_66));
+	SLJIT_ASSERT(a != SLJIT_IMM || !(flags & EX86_SSE2));
+	SLJIT_ASSERT(((flags & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66))
+			& ((flags & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66)) - 1)) == 0);
+	SLJIT_ASSERT((flags & (EX86_VEX_EXT | EX86_REX)) != EX86_VEX_EXT);
 
 	size &= 0xf;
-	inst_size = size;
+	/* The mod r/m byte is always present. */
+	inst_size = size + 1;
 
 	if (!compiler->mode32 && !(flags & EX86_NO_REXW))
 		rex |= REX_W;
 	else if (flags & EX86_REX)
 		rex |= REX;
 
-	if (flags & (EX86_PREF_F2 | EX86_PREF_F3))
-		inst_size++;
-	if (flags & EX86_PREF_66)
+	if (flags & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66))
 		inst_size++;
 
 	/* Calculate size of b. */
-	inst_size += 1; /* mod r/m byte. */
 	if (b & SLJIT_MEM) {
 		if (!(b & OFFS_REG_MASK) && NOT_HALFWORD(immb)) {
 			PTR_FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immb));
@@ -119,8 +117,7 @@
 					inst_size += sizeof(sljit_s8);
 				else
 					inst_size += sizeof(sljit_s32);
-			}
-			else if (reg_lmap[b & REG_MASK] == 5) {
+			} else if (reg_lmap[b & REG_MASK] == 5) {
 				/* Swap registers if possible. */
 				if ((b & OFFS_REG_MASK) && (immb & 0x3) == 0 && reg_lmap[OFFS_REG(b)] != 5)
 					b = SLJIT_MEM | OFFS_REG(b) | TO_OFFS_REG(b & REG_MASK);
@@ -140,23 +137,26 @@
 					rex |= REX_X;
 			}
 		}
-	}
-	else if (!(flags & EX86_SSE2_OP2)) {
+	} else if (!(flags & EX86_SSE2_OP2)) {
 		if (reg_map[b] >= 8)
 			rex |= REX_B;
-	}
-	else if (freg_map[b] >= 8)
+	} else if (freg_map[b] >= 8)
 		rex |= REX_B;
 
-	if (a & SLJIT_IMM) {
+	if ((flags & EX86_VEX_EXT) && (rex & 0x3)) {
+		SLJIT_ASSERT(size == 2);
+		size++;
+		inst_size++;
+	}
+
+	if (a == SLJIT_IMM) {
 		if (flags & EX86_BIN_INS) {
 			if (imma <= 127 && imma >= -128) {
 				inst_size += 1;
 				flags |= EX86_BYTE_ARG;
 			} else
 				inst_size += 4;
-		}
-		else if (flags & EX86_SHIFT_INS) {
+		} else if (flags & EX86_SHIFT_INS) {
 			SLJIT_ASSERT(imma <= (compiler->mode32 ? 0x1f : 0x3f));
 			if (imma != 1) {
 				inst_size++;
@@ -168,8 +168,7 @@
 			inst_size += sizeof(short);
 		else
 			inst_size += sizeof(sljit_s32);
-	}
-	else {
+	} else {
 		SLJIT_ASSERT(!(flags & EX86_SHIFT_INS) || a == SLJIT_PREF_SHIFT_REG);
 		/* reg_map[SLJIT_PREF_SHIFT_REG] is less than 8. */
 		if (!(flags & EX86_SSE2_OP1)) {
@@ -186,14 +185,16 @@
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + inst_size);
 	PTR_FAIL_IF(!inst);
 
-	/* Encoding the byte. */
+	/* Encoding prefixes. */
 	INC_SIZE(inst_size);
 	if (flags & EX86_PREF_F2)
 		*inst++ = 0xf2;
-	if (flags & EX86_PREF_F3)
+	else if (flags & EX86_PREF_F3)
 		*inst++ = 0xf3;
-	if (flags & EX86_PREF_66)
+	else if (flags & EX86_PREF_66)
 		*inst++ = 0x66;
+
+	/* REX is always the last prefix. */
 	if (rex)
 		*inst++ = rex;
 
@@ -201,18 +202,17 @@
 
 	/* Encode mod/rm byte. */
 	if (!(flags & EX86_SHIFT_INS)) {
-		if ((flags & EX86_BIN_INS) && (a & SLJIT_IMM))
+		if ((flags & EX86_BIN_INS) && a == SLJIT_IMM)
 			*inst = (flags & EX86_BYTE_ARG) ? GROUP_BINARY_83 : GROUP_BINARY_81;
 
-		if (a & SLJIT_IMM)
+		if (a == SLJIT_IMM)
 			*buf_ptr = 0;
 		else if (!(flags & EX86_SSE2_OP1))
 			*buf_ptr = U8(reg_lmap[a] << 3);
 		else
 			*buf_ptr = U8(freg_lmap[a] << 3);
-	}
-	else {
-		if (a & SLJIT_IMM) {
+	} else {
+		if (a == SLJIT_IMM) {
 			if (imma == 1)
 				*inst = GROUP_SHIFT_1;
 			else
@@ -239,8 +239,9 @@
 			if (!(b & OFFS_REG_MASK))
 				*buf_ptr++ |= reg_lmap_b;
 			else {
-				*buf_ptr++ |= 0x04;
-				*buf_ptr++ = U8(reg_lmap_b | (reg_lmap[OFFS_REG(b)] << 3));
+				buf_ptr[0] |= 0x04;
+				buf_ptr[1] = U8(reg_lmap_b | (reg_lmap[OFFS_REG(b)] << 3));
+				buf_ptr += 2;
 			}
 
 			if (immb != 0 || reg_lmap_b == 5) {
@@ -251,26 +252,26 @@
 					buf_ptr += sizeof(sljit_s32);
 				}
 			}
-		}
-		else {
+		} else {
 			if (reg_lmap_b == 5)
 				*buf_ptr |= 0x40;
 
-			*buf_ptr++ |= 0x04;
-			*buf_ptr++ = U8(reg_lmap_b | (reg_lmap[OFFS_REG(b)] << 3) | (immb << 6));
+			buf_ptr[0] |= 0x04;
+			buf_ptr[1] = U8(reg_lmap_b | (reg_lmap[OFFS_REG(b)] << 3) | (immb << 6));
+			buf_ptr += 2;
 
 			if (reg_lmap_b == 5)
 				*buf_ptr++ = 0;
 		}
-	}
-	else {
-		*buf_ptr++ |= 0x04;
-		*buf_ptr++ = 0x25;
+	} else {
+		buf_ptr[0] |= 0x04;
+		buf_ptr[1] = 0x25;
+		buf_ptr += 2;
 		sljit_unaligned_store_s32(buf_ptr, (sljit_s32)immb); /* 32 bit displacement. */
 		buf_ptr += sizeof(sljit_s32);
 	}
 
-	if (a & SLJIT_IMM) {
+	if (a == SLJIT_IMM) {
 		if (flags & EX86_BYTE_ARG)
 			*buf_ptr = U8(imma);
 		else if (flags & EX86_HALF_ARG)
@@ -279,7 +280,78 @@
 			sljit_unaligned_store_s32(buf_ptr, (sljit_s32)imma);
 	}
 
-	return !(flags & EX86_SHIFT_INS) ? inst : (inst + 1);
+	return inst;
+}
+
+static sljit_s32 emit_vex_instruction(struct sljit_compiler *compiler, sljit_uw op,
+	/* The first and second register operand. */
+	sljit_s32 a, sljit_s32 v,
+	/* The general operand (not immediate). */
+	sljit_s32 b, sljit_sw immb)
+{
+	sljit_u8 *inst;
+	sljit_u8 vex = 0;
+	sljit_u8 vex_m = 0;
+	sljit_uw size;
+
+	SLJIT_ASSERT(((op & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66))
+			& ((op & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66)) - 1)) == 0);
+
+	op |= EX86_REX;
+
+	if (op & VEX_OP_0F38)
+		vex_m = 0x2;
+	else if (op & VEX_OP_0F3A)
+		vex_m = 0x3;
+
+	if ((op & VEX_W) || ((op & VEX_AUTO_W) && !compiler->mode32)) {
+		if (vex_m == 0)
+			vex_m = 0x1;
+
+		vex |= 0x80;
+	}
+
+	if (op & EX86_PREF_66)
+		vex |= 0x1;
+	else if (op & EX86_PREF_F2)
+		vex |= 0x3;
+	else if (op & EX86_PREF_F3)
+		vex |= 0x2;
+
+	op &= ~(EX86_PREF_66 | EX86_PREF_F2 | EX86_PREF_F3);
+
+	if (op & VEX_256)
+		vex |= 0x4;
+
+	vex = U8(vex | ((((op & VEX_SSE2_OPV) ? freg_map[v] : reg_map[v]) ^ 0xf) << 3));
+
+	size = op & ~(sljit_uw)0xff;
+	size |= (vex_m == 0) ? (EX86_VEX_EXT | 2) : 3;
+
+	inst = emit_x86_instruction(compiler, size, a, 0, b, immb);
+	FAIL_IF(!inst);
+
+	SLJIT_ASSERT((inst[-1] & 0xf0) == REX);
+
+	/* If X or B is present in REX prefix. */
+	if (vex_m == 0 && inst[-1] & 0x3)
+		vex_m = 0x1;
+
+	if (vex_m == 0) {
+		vex |= U8(((inst[-1] >> 2) ^ 0x1) << 7);
+
+		inst[-1] = 0xc5;
+		inst[0] = vex;
+		inst[1] = U8(op);
+		return SLJIT_SUCCESS;
+	}
+
+	vex_m |= U8((inst[-1] ^ 0x7) << 5);
+	inst[-1] = 0xc4;
+	inst[0] = vex_m;
+	inst[1] = vex;
+	inst[2] = U8(op);
+	return SLJIT_SUCCESS;
 }
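Unlike the 32-bit variant, this version must fold an already-emitted REX prefix into the VEX bytes: EX86_REX forces emit_x86_instruction to emit one (the EX86_VEX_EXT path above reserves an extra byte when the two-byte form will not fit), and inst[-1] is then rewritten in place. Since REX is 0100WRXB and VEX stores R/X/B inverted, the bit manipulation works out as:

	/* REX.B = 0x1, REX.X = 0x2: the two-byte form carries only R, so if
	   either is set, fall back to 0xc4 (vex_m forced to 0x1 above).
	   REX.R = 0x4, inverted into bit 7 of the two-byte form; the U8()
	   cast discards the bits shifted above bit 7:
	       vex |= U8(((inst[-1] >> 2) ^ 0x1) << 7);
	   Three-byte form, byte 1 holds ~R ~X ~B in bits 7..5:
	       vex_m |= U8((inst[-1] ^ 0x7) << 5); */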
 
 /* --------------------------------------------------------------------- */
@@ -539,16 +611,12 @@
 
 		tmp = SLJIT_FS0 - fsaveds;
 		for (i = SLJIT_FS0; i > tmp; i--) {
-			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			inst[0] = GROUP_0F;
-			inst[1] = MOVAPS_xm_x;
+			FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
 			saved_float_regs_offset += 16;
 		}
 
 		for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
-			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			inst[0] = GROUP_0F;
-			inst[1] = MOVAPS_xm_x;
+			FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
 			saved_float_regs_offset += 16;
 		}
 	}
@@ -606,16 +674,12 @@
 
 		tmp = SLJIT_FS0 - fsaveds;
 		for (i = SLJIT_FS0; i > tmp; i--) {
-			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			inst[0] = GROUP_0F;
-			inst[1] = MOVAPS_x_xm;
+			FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
 			saved_float_regs_offset += 16;
 		}
 
 		for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
-			inst = emit_x86_instruction(compiler, 2 | EX86_SSE2, i, 0, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset);
-			inst[0] = GROUP_0F;
-			inst[1] = MOVAPS_x_xm;
+			FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
 			saved_float_regs_offset += 16;
 		}
 
@@ -1028,7 +1092,7 @@
 
 	compiler->mode32 = 0;
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (FAST_IS_REG(dst)) {
 			if (!sign || ((sljit_u32)srcw <= 0x7fffffff))
 				return emit_do_imm32(compiler, reg_map[dst] <= 7 ? 0 : REX_B, U8(MOV_r_i32 | reg_lmap[dst]), srcw);
@@ -1084,17 +1148,14 @@
 	compiler->mode32 = 0;
 
 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_U32) {
-		if (!(src & SLJIT_IMM)) {
+		if (src != SLJIT_IMM) {
 			compiler->mode32 = 1;
 			EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
 			compiler->mode32 = 0;
 		} else
 			FAIL_IF(emit_do_imm32(compiler, reg_map[TMP_REG1] <= 7 ? 0 : REX_B, U8(MOV_r_i32 | reg_lmap[TMP_REG1]), srcw));
 
-		inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, TMP_REG1, 0);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = CVTSI2SD_x_rm;
+		FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
 
 		compiler->mode32 = 1;
 
@@ -1119,10 +1180,7 @@
 	size1 = compiler->size;
 
 	compiler->mode32 = 0;
-	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, 0);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = CVTSI2SD_x_rm;
+	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0));
 
 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
 	FAIL_IF(!inst);
@@ -1141,7 +1199,7 @@
 
 	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 1, TMP_REG1, 0);
 	FAIL_IF(!inst);
-	*inst |= SHR;
+	inst[1] |= SHR;
 
 	compiler->mode32 = 1;
 	BINARY_IMM32(AND, 1, TMP_REG2, 0);
@@ -1151,13 +1209,9 @@
 	FAIL_IF(!inst);
 	inst[0] = OR_r_rm;
 
-	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, TMP_REG1, 0);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = CVTSI2SD_x_rm;
-
+	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
 	compiler->mode32 = 1;
-	FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_32, dst_r, dst_r, 0));
+	FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0));
 
 	jump_inst2[1] = U8(compiler->size - size2);
 
diff --git a/src/sljit/sljitNativeX86_common.c b/src/sljit/sljitNativeX86_common.c
index d003e3b..cc330d4 100644
--- a/src/sljit/sljitNativeX86_common.c
+++ b/src/sljit/sljitNativeX86_common.c
@@ -140,192 +140,232 @@
 
 #define U8(v)			((sljit_u8)(v))
 
-
 /* Size flags for emit_x86_instruction: */
-#define EX86_BIN_INS		0x0010
-#define EX86_SHIFT_INS		0x0020
-#define EX86_REX		0x0040
-#define EX86_NO_REXW		0x0080
-#define EX86_BYTE_ARG		0x0100
-#define EX86_HALF_ARG		0x0200
-#define EX86_PREF_66		0x0400
-#define EX86_PREF_F2		0x0800
-#define EX86_PREF_F3		0x1000
-#define EX86_SSE2_OP1		0x2000
-#define EX86_SSE2_OP2		0x4000
+#define EX86_BIN_INS		((sljit_uw)0x000010)
+#define EX86_SHIFT_INS		((sljit_uw)0x000020)
+#define EX86_BYTE_ARG		((sljit_uw)0x000040)
+#define EX86_HALF_ARG		((sljit_uw)0x000080)
+/* Size flags for both emit_x86_instruction and emit_vex_instruction: */
+#define EX86_REX		((sljit_uw)0x000100)
+#define EX86_NO_REXW		((sljit_uw)0x000200)
+#define EX86_PREF_66		((sljit_uw)0x000400)
+#define EX86_PREF_F2		((sljit_uw)0x000800)
+#define EX86_PREF_F3		((sljit_uw)0x001000)
+#define EX86_SSE2_OP1		((sljit_uw)0x002000)
+#define EX86_SSE2_OP2		((sljit_uw)0x004000)
 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
+#define EX86_VEX_EXT		((sljit_uw)0x008000)
+/* Op flags for emit_vex_instruction: */
+#define VEX_OP_0F38		((sljit_uw)0x010000)
+#define VEX_OP_0F3A		((sljit_uw)0x020000)
+#define VEX_SSE2_OPV		((sljit_uw)0x040000)
+#define VEX_AUTO_W		((sljit_uw)0x080000)
+#define VEX_W			((sljit_uw)0x100000)
+#define VEX_256			((sljit_uw)0x200000)
+
+#define EX86_SELECT_66(op)	(((op) & SLJIT_32) ? 0 : EX86_PREF_66)
+#define EX86_SELECT_F2_F3(op)	(((op) & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2)
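The prefix-exclusivity assertions rewritten earlier in this patch rely on these flags being single bits: for x = flags & (EX86_PREF_F2 | EX86_PREF_F3 | EX86_PREF_66), the expression x & (x - 1) clears the lowest set bit, so it is zero exactly when at most one mandatory prefix is selected. For example:

	/* x = 0x0000:  0 & (0 - 1)     == 0       no prefix, ok
	   x = 0x0400:  0x0400 & 0x03ff == 0       one prefix, ok
	   x = 0x0c00:  0x0c00 & 0x0bff == 0x0800  two prefixes, assert fires */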
 
 /* --------------------------------------------------------------------- */
 /*  Instruction forms                                                    */
 /* --------------------------------------------------------------------- */
 
-#define ADD		(/* BINARY */ 0 << 3)
-#define ADD_EAX_i32	0x05
-#define ADD_r_rm	0x03
-#define ADD_rm_r	0x01
-#define ADDSD_x_xm	0x58
-#define ADC		(/* BINARY */ 2 << 3)
-#define ADC_EAX_i32	0x15
-#define ADC_r_rm	0x13
-#define ADC_rm_r	0x11
-#define AND		(/* BINARY */ 4 << 3)
-#define AND_EAX_i32	0x25
-#define AND_r_rm	0x23
-#define AND_rm_r	0x21
-#define ANDPD_x_xm	0x54
-#define BSR_r_rm	(/* GROUP_0F */ 0xbd)
-#define BSF_r_rm	(/* GROUP_0F */ 0xbc)
-#define BSWAP_r		(/* GROUP_0F */ 0xc8)
-#define CALL_i32	0xe8
-#define CALL_rm		(/* GROUP_FF */ 2 << 3)
-#define CDQ		0x99
-#define CMOVE_r_rm	(/* GROUP_0F */ 0x44)
-#define CMP		(/* BINARY */ 7 << 3)
-#define CMP_EAX_i32	0x3d
-#define CMP_r_rm	0x3b
-#define CMP_rm_r	0x39
-#define CMPS_x_xm	0xc2
-#define CMPXCHG_rm_r	0xb1
-#define CMPXCHG_rm8_r	0xb0
-#define CVTPD2PS_x_xm	0x5a
-#define CVTSI2SD_x_rm	0x2a
-#define CVTTSD2SI_r_xm	0x2c
-#define DIV		(/* GROUP_F7 */ 6 << 3)
-#define DIVSD_x_xm	0x5e
-#define EXTRACTPS_x_xm	0x17
-#define FLDS		0xd9
-#define FLDL		0xdd
-#define FSTPS		0xd9
-#define FSTPD		0xdd
-#define INSERTPS_x_xm	0x21
-#define INT3		0xcc
-#define IDIV		(/* GROUP_F7 */ 7 << 3)
-#define IMUL		(/* GROUP_F7 */ 5 << 3)
-#define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
-#define IMUL_r_rm_i8	0x6b
-#define IMUL_r_rm_i32	0x69
-#define JL_i8		0x7c
-#define JE_i8		0x74
-#define JNC_i8		0x73
-#define JNE_i8		0x75
-#define JMP_i8		0xeb
-#define JMP_i32		0xe9
-#define JMP_rm		(/* GROUP_FF */ 4 << 3)
-#define LEA_r_m		0x8d
-#define LOOP_i8		0xe2
-#define LZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
-#define MOV_r_rm	0x8b
-#define MOV_r_i32	0xb8
-#define MOV_rm_r	0x89
-#define MOV_rm_i32	0xc7
-#define MOV_rm8_i8	0xc6
-#define MOV_rm8_r8	0x88
-#define MOVAPS_x_xm	0x28
-#define MOVAPS_xm_x	0x29
-#define MOVD_x_rm	0x6e
-#define MOVD_rm_x	0x7e
-#define MOVDDUP_x_xm	0x12
-#define MOVDQA_x_xm	0x6f
-#define MOVDQA_xm_x	0x7f
-#define MOVHLPS_x_x	0x12
-#define MOVHPD_m_x	0x17
-#define MOVHPD_x_m	0x16
-#define MOVLHPS_x_x	0x16
-#define MOVLPD_m_x	0x13
-#define MOVLPD_x_m	0x12
-#define MOVSD_x_xm	0x10
-#define MOVSD_xm_x	0x11
-#define MOVSHDUP_x_xm	0x16
-#define MOVSXD_r_rm	0x63
-#define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
-#define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
-#define MOVUPS_x_xm	0x10
-#define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
-#define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
-#define MUL		(/* GROUP_F7 */ 4 << 3)
-#define MULSD_x_xm	0x59
-#define NEG_rm		(/* GROUP_F7 */ 3 << 3)
-#define NOP		0x90
-#define NOT_rm		(/* GROUP_F7 */ 2 << 3)
-#define OR		(/* BINARY */ 1 << 3)
-#define OR_r_rm		0x0b
-#define OR_EAX_i32	0x0d
-#define OR_rm_r		0x09
-#define OR_rm8_r8	0x08
-#define PCMPEQB_x_xm	0x74
-#define PINSRB_x_rm_i8	0x20
-#define PINSRW_x_rm_i8	0xc4
-#define PINSRD_x_rm_i8	0x22
-#define PEXTRB_rm_x_i8	0x14
-#define PEXTRW_rm_x_i8	0x15
-#define PEXTRD_rm_x_i8	0x16
-#define POP_r		0x58
-#define POP_rm		0x8f
-#define POPF		0x9d
-#define PREFETCH	0x18
-#define PSHUFB_x_xm	0x00
-#define PSHUFD_x_xm	0x70
-#define PSHUFLW_x_xm	0x70
-#define PSRLDQ_x	0x73
-#define PUSH_i32	0x68
-#define PUSH_r		0x50
-#define PUSH_rm		(/* GROUP_FF */ 6 << 3)
-#define PUSHF		0x9c
-#define PXOR_x_xm	0xef
-#define ROL		(/* SHIFT */ 0 << 3)
-#define ROR		(/* SHIFT */ 1 << 3)
-#define RET_near	0xc3
-#define RET_i16		0xc2
-#define SBB		(/* BINARY */ 3 << 3)
-#define SBB_EAX_i32	0x1d
-#define SBB_r_rm	0x1b
-#define SBB_rm_r	0x19
-#define SAR		(/* SHIFT */ 7 << 3)
-#define SHL		(/* SHIFT */ 4 << 3)
-#define SHLD		(/* GROUP_0F */ 0xa5)
-#define SHRD		(/* GROUP_0F */ 0xad)
-#define SHR		(/* SHIFT */ 5 << 3)
-#define SHUFPS_x_xm	0xc6
-#define SUB		(/* BINARY */ 5 << 3)
-#define SUB_EAX_i32	0x2d
-#define SUB_r_rm	0x2b
-#define SUB_rm_r	0x29
-#define SUBSD_x_xm	0x5c
-#define TEST_EAX_i32	0xa9
-#define TEST_rm_r	0x85
-#define TZCNT_r_rm	(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
-#define UCOMISD_x_xm	0x2e
-#define UNPCKLPD_x_xm	0x14
-#define UNPCKLPS_x_xm	0x14
-#define XCHG_EAX_r	0x90
-#define XCHG_r_rm	0x87
-#define XOR		(/* BINARY */ 6 << 3)
-#define XOR_EAX_i32	0x35
-#define XOR_r_rm	0x33
-#define XOR_rm_r	0x31
-#define XORPD_x_xm	0x57
+#define ADD			(/* BINARY */ 0 << 3)
+#define ADD_EAX_i32		0x05
+#define ADD_r_rm		0x03
+#define ADD_rm_r		0x01
+#define ADDSD_x_xm		0x58
+#define ADC			(/* BINARY */ 2 << 3)
+#define ADC_EAX_i32		0x15
+#define ADC_r_rm		0x13
+#define ADC_rm_r		0x11
+#define AND			(/* BINARY */ 4 << 3)
+#define AND_EAX_i32		0x25
+#define AND_r_rm		0x23
+#define AND_rm_r		0x21
+#define ANDPD_x_xm		0x54
+#define BSR_r_rm		(/* GROUP_0F */ 0xbd)
+#define BSF_r_rm		(/* GROUP_0F */ 0xbc)
+#define BSWAP_r			(/* GROUP_0F */ 0xc8)
+#define CALL_i32		0xe8
+#define CALL_rm			(/* GROUP_FF */ 2 << 3)
+#define CDQ			0x99
+#define CMOVE_r_rm		(/* GROUP_0F */ 0x44)
+#define CMP			(/* BINARY */ 7 << 3)
+#define CMP_EAX_i32		0x3d
+#define CMP_r_rm		0x3b
+#define CMP_rm_r		0x39
+#define CMPS_x_xm		0xc2
+#define CMPXCHG_rm_r		0xb1
+#define CMPXCHG_rm8_r		0xb0
+#define CVTPD2PS_x_xm		0x5a
+#define CVTPS2PD_x_xm		0x5a
+#define CVTSI2SD_x_rm		0x2a
+#define CVTTSD2SI_r_xm		0x2c
+#define DIV			(/* GROUP_F7 */ 6 << 3)
+#define DIVSD_x_xm		0x5e
+#define EXTRACTPS_x_xm		0x17
+#define FLDS			0xd9
+#define FLDL			0xdd
+#define FSTPS			0xd9
+#define FSTPD			0xdd
+#define INSERTPS_x_xm		0x21
+#define INT3			0xcc
+#define IDIV			(/* GROUP_F7 */ 7 << 3)
+#define IMUL			(/* GROUP_F7 */ 5 << 3)
+#define IMUL_r_rm		(/* GROUP_0F */ 0xaf)
+#define IMUL_r_rm_i8		0x6b
+#define IMUL_r_rm_i32		0x69
+#define JL_i8			0x7c
+#define JE_i8			0x74
+#define JNC_i8			0x73
+#define JNE_i8			0x75
+#define JMP_i8			0xeb
+#define JMP_i32			0xe9
+#define JMP_rm			(/* GROUP_FF */ 4 << 3)
+#define LEA_r_m			0x8d
+#define LOOP_i8			0xe2
+#define LZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbd)
+#define MOV_r_rm		0x8b
+#define MOV_r_i32		0xb8
+#define MOV_rm_r		0x89
+#define MOV_rm_i32		0xc7
+#define MOV_rm8_i8		0xc6
+#define MOV_rm8_r8		0x88
+#define MOVAPS_x_xm		0x28
+#define MOVAPS_xm_x		0x29
+#define MOVD_x_rm		0x6e
+#define MOVD_rm_x		0x7e
+#define MOVDDUP_x_xm		0x12
+#define MOVDQA_x_xm		0x6f
+#define MOVDQA_xm_x		0x7f
+#define MOVHLPS_x_x		0x12
+#define MOVHPD_m_x		0x17
+#define MOVHPD_x_m		0x16
+#define MOVLHPS_x_x		0x16
+#define MOVLPD_m_x		0x13
+#define MOVLPD_x_m		0x12
+#define MOVMSKPS_r_x		(/* GROUP_0F */ 0x50)
+#define MOVQ_x_xm		(/* GROUP_0F */ 0x7e)
+#define MOVSD_x_xm		0x10
+#define MOVSD_xm_x		0x11
+#define MOVSHDUP_x_xm		0x16
+#define MOVSXD_r_rm		0x63
+#define MOVSX_r_rm8		(/* GROUP_0F */ 0xbe)
+#define MOVSX_r_rm16		(/* GROUP_0F */ 0xbf)
+#define MOVUPS_x_xm		0x10
+#define MOVZX_r_rm8		(/* GROUP_0F */ 0xb6)
+#define MOVZX_r_rm16		(/* GROUP_0F */ 0xb7)
+#define MUL			(/* GROUP_F7 */ 4 << 3)
+#define MULSD_x_xm		0x59
+#define NEG_rm			(/* GROUP_F7 */ 3 << 3)
+#define NOP			0x90
+#define NOT_rm			(/* GROUP_F7 */ 2 << 3)
+#define OR			(/* BINARY */ 1 << 3)
+#define OR_r_rm			0x0b
+#define OR_EAX_i32		0x0d
+#define OR_rm_r			0x09
+#define OR_rm8_r8		0x08
+#define PACKSSWB_x_xm		(/* GROUP_0F */ 0x63)
+#define PCMPEQB_x_xm		0x74
+#define PINSRB_x_rm_i8		0x20
+#define PINSRW_x_rm_i8		0xc4
+#define PINSRD_x_rm_i8		0x22
+#define PEXTRB_rm_x_i8		0x14
+#define PEXTRW_rm_x_i8		0x15
+#define PEXTRD_rm_x_i8		0x16
+#define PMOVMSKB_r_x		(/* GROUP_0F */ 0xd7)
+#define PMOVSXBD_x_xm		0x21
+#define PMOVSXBQ_x_xm		0x22
+#define PMOVSXBW_x_xm		0x20
+#define PMOVSXDQ_x_xm		0x25
+#define PMOVSXWD_x_xm		0x23
+#define PMOVSXWQ_x_xm		0x24
+#define PMOVZXBD_x_xm		0x31
+#define PMOVZXBQ_x_xm		0x32
+#define PMOVZXBW_x_xm		0x30
+#define PMOVZXDQ_x_xm		0x35
+#define PMOVZXWD_x_xm		0x33
+#define PMOVZXWQ_x_xm		0x34
+#define POP_r			0x58
+#define POP_rm			0x8f
+#define POPF			0x9d
+#define PREFETCH		0x18
+#define PSHUFB_x_xm		0x00
+#define PSHUFD_x_xm		0x70
+#define PSHUFLW_x_xm		0x70
+#define PSRLDQ_x		0x73
+#define PUSH_i32		0x68
+#define PUSH_r			0x50
+#define PUSH_rm			(/* GROUP_FF */ 6 << 3)
+#define PUSHF			0x9c
+#define PXOR_x_xm		0xef
+#define ROL			(/* SHIFT */ 0 << 3)
+#define ROR			(/* SHIFT */ 1 << 3)
+#define RET_near		0xc3
+#define RET_i16			0xc2
+#define SBB			(/* BINARY */ 3 << 3)
+#define SBB_EAX_i32		0x1d
+#define SBB_r_rm		0x1b
+#define SBB_rm_r		0x19
+#define SAR			(/* SHIFT */ 7 << 3)
+#define SHL			(/* SHIFT */ 4 << 3)
+#define SHLD			(/* GROUP_0F */ 0xa5)
+#define SHRD			(/* GROUP_0F */ 0xad)
+#define SHR			(/* SHIFT */ 5 << 3)
+#define SHUFPS_x_xm		0xc6
+#define SUB			(/* BINARY */ 5 << 3)
+#define SUB_EAX_i32		0x2d
+#define SUB_r_rm		0x2b
+#define SUB_rm_r		0x29
+#define SUBSD_x_xm		0x5c
+#define TEST_EAX_i32		0xa9
+#define TEST_rm_r		0x85
+#define TZCNT_r_rm		(/* GROUP_F3 */ /* GROUP_0F */ 0xbc)
+#define UCOMISD_x_xm		0x2e
+#define UNPCKLPD_x_xm		0x14
+#define UNPCKLPS_x_xm		0x14
+#define VBROADCASTSD_x_xm	0x19
+#define VBROADCASTSS_x_xm	0x18
+#define VEXTRACTF128_x_ym	0x19
+#define VEXTRACTI128_x_ym	0x39
+#define VINSERTF128_y_y_xm	0x18
+#define VINSERTI128_y_y_xm	0x38
+#define VPBROADCASTB_x_xm	0x78
+#define VPBROADCASTD_x_xm	0x58
+#define VPBROADCASTQ_x_xm	0x59
+#define VPBROADCASTW_x_xm	0x79
+#define VPERMPD_y_ym		0x01
+#define VPERMQ_y_ym		0x00
+#define XCHG_EAX_r		0x90
+#define XCHG_r_rm		0x87
+#define XOR			(/* BINARY */ 6 << 3)
+#define XOR_EAX_i32		0x35
+#define XOR_r_rm		0x33
+#define XOR_rm_r		0x31
+#define XORPD_x_xm		0x57
 
-#define GROUP_0F	0x0f
-#define GROUP_66	0x66
-#define GROUP_F3	0xf3
-#define GROUP_F7	0xf7
-#define GROUP_FF	0xff
-#define GROUP_BINARY_81	0x81
-#define GROUP_BINARY_83	0x83
-#define GROUP_SHIFT_1	0xd1
-#define GROUP_SHIFT_N	0xc1
-#define GROUP_SHIFT_CL	0xd3
-#define GROUP_LOCK	0xf0
+#define GROUP_0F		0x0f
+#define GROUP_66		0x66
+#define GROUP_F3		0xf3
+#define GROUP_F7		0xf7
+#define GROUP_FF		0xff
+#define GROUP_BINARY_81		0x81
+#define GROUP_BINARY_83		0x83
+#define GROUP_SHIFT_1		0xd1
+#define GROUP_SHIFT_N		0xc1
+#define GROUP_SHIFT_CL		0xd3
+#define GROUP_LOCK		0xf0
 
-#define MOD_REG		0xc0
-#define MOD_DISP8	0x40
+#define MOD_REG			0xc0
+#define MOD_DISP8		0x40
 
-#define INC_SIZE(s)			(*inst++ = U8(s), compiler->size += (s))
+#define INC_SIZE(s)		(*inst++ = U8(s), compiler->size += (s))
 
-#define PUSH_REG(r)			(*inst++ = U8(PUSH_r + (r)))
-#define POP_REG(r)			(*inst++ = U8(POP_r + (r)))
-#define RET()				(*inst++ = RET_near)
-#define RET_I16(n)			(*inst++ = RET_i16, *inst++ = U8(n), *inst++ = 0)
+#define PUSH_REG(r)		(*inst++ = U8(PUSH_r + (r)))
+#define POP_REG(r)		(*inst++ = U8(POP_r + (r)))
+#define RET()			(*inst++ = RET_near)
+#define RET_I16(n)		(*inst++ = RET_i16, *inst++ = U8(n), *inst++ = 0)
 
 /* Multithreading does not affect these static variables, since they store
    built-in CPU features. Therefore they can be overwritten by different threads
@@ -338,6 +378,8 @@
 #define CPU_FEATURE_LZCNT		0x008
 #define CPU_FEATURE_TZCNT		0x010
 #define CPU_FEATURE_CMOV		0x020
+#define CPU_FEATURE_AVX			0x040
+#define CPU_FEATURE_AVX2		0x080
 
 static sljit_u32 cpu_feature_list = 0;
 
@@ -370,129 +412,115 @@
 /*    Utility functions                               */
 /******************************************************/
 
-static void get_cpu_features(void)
+static void execute_cpu_id(sljit_u32 info[4])
 {
-	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
-	sljit_u32 value_ecx, value_edx;
-
 #if defined(_MSC_VER) && _MSC_VER >= 1400
 
-	int CPUInfo[4];
-
-	__cpuid(CPUInfo, 0);
-	if (CPUInfo[0] >= 7) {
-		__cpuidex(CPUInfo, 7, 0);
-		if (CPUInfo[1] & 0x8)
-			feature_list |= CPU_FEATURE_TZCNT;
-	}
-
-	__cpuid(CPUInfo, (int)0x80000001);
-	if (CPUInfo[2] & 0x20)
-		feature_list |= CPU_FEATURE_LZCNT;
-
-	__cpuid(CPUInfo, 1);
-	value_ecx = (sljit_u32)CPUInfo[2];
-	value_edx = (sljit_u32)CPUInfo[3];
+	__cpuidex((int*)info, (int)info[0], (int)info[2]);
 
 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C) || defined(__TINYC__)
 
 	/* AT&T syntax. */
 	__asm__ (
-		"movl $0x0, %%eax\n"
-		"lzcnt %%eax, %%eax\n"
-		"setnz %%al\n"
-		"movl %%eax, %0\n"
-		: "=g" (value_ecx)
-		:
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		: "eax"
-#else
-		: "rax"
-#endif
-	);
-
-	if (value_ecx & 0x1)
-		feature_list |= CPU_FEATURE_LZCNT;
-
-	__asm__ (
-		"movl $0x0, %%eax\n"
-		"tzcnt %%eax, %%eax\n"
-		"setnz %%al\n"
-		"movl %%eax, %0\n"
-		: "=g" (value_ecx)
-		:
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		: "eax"
-#else
-		: "rax"
-#endif
-	);
-
-	if (value_ecx & 0x1)
-		feature_list |= CPU_FEATURE_TZCNT;
-
-	__asm__ (
-		"movl $0x1, %%eax\n"
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		/* On x86-32, there is no red zone, so this
-		   should work (no need for a local variable). */
-		"push %%ebx\n"
-#endif
+		"movl %0, %%esi\n"
+		"movl (%%esi), %%eax\n"
+		"movl 8(%%esi), %%ecx\n"
 		"cpuid\n"
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		"pop %%ebx\n"
-#endif
-		"movl %%edx, %0\n"
-		"movl %%edx, %1\n"
-		: "=g" (value_ecx), "=g" (value_edx)
+		"movl %%eax, (%%esi)\n"
+		"movl %%ebx, 4(%%esi)\n"
+		"movl %%ecx, 8(%%esi)\n"
+		"movl %%edx, 12(%%esi)\n"
+#else /* !SLJIT_CONFIG_X86_32 */
+		"movq %0, %%rsi\n"
+		"movl (%%rsi), %%eax\n"
+		"movl 8(%%rsi), %%ecx\n"
+		"cpuid\n"
+		"movl %%eax, (%%rsi)\n"
+		"movl %%ebx, 4(%%rsi)\n"
+		"movl %%ecx, 8(%%rsi)\n"
+		"movl %%edx, 12(%%rsi)\n"
+#endif /* SLJIT_CONFIG_X86_32 */
 		:
+		: "r" (info)
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		: "eax", "ecx", "edx"
-#else
-		: "rax", "rbx", "rcx", "rdx"
-#endif
+		: "memory", "eax", "ebx", "ecx", "edx", "esi"
+#else /* !SLJIT_CONFIG_X86_32 */
+		: "memory", "rax", "rbx", "rcx", "rdx", "rsi"
+#endif /* SLJIT_CONFIG_X86_32 */
 	);
 
-#else /* _MSC_VER && _MSC_VER >= 1400 */
+#else /* _MSC_VER < 1400 */
 
 	/* Intel syntax. */
 	__asm {
-		mov eax, 0
-		lzcnt eax, eax
-		setnz al
-		mov value_ecx, eax
-	}
-
-	if (value_ecx & 0x1)
-		feature_list |= CPU_FEATURE_LZCNT;
-
-	__asm {
-		mov eax, 0
-		tzcnt eax, eax
-		setnz al
-		mov value_ecx, eax
-	}
-
-	if (value_ecx & 0x1)
-		feature_list |= CPU_FEATURE_TZCNT;
-
-	__asm {
-		mov eax, 1
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		mov esi, info
+		mov eax, [esi]
+		mov ecx, [esi + 8]
 		cpuid
-		mov value_ecx, ecx
-		mov value_edx, edx
+		mov [esi], eax
+		mov [esi + 4], ebx
+		mov [esi + 8], ecx
+		mov [esi + 12], edx
+#else /* !SLJIT_CONFIG_X86_32 */
+		mov rsi, info
+		mov eax, [rsi]
+		mov ecx, [rsi + 8]
+		cpuid
+		mov [rsi], eax
+		mov [rsi + 4], ebx
+		mov [rsi + 8], ecx
+		mov [rsi + 12], edx
+#endif /* SLJIT_CONFIG_X86_32 */
 	}
 
 #endif /* _MSC_VER && _MSC_VER >= 1400 */
+}
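All CPUID queries now funnel through this helper: the caller places the leaf in info[0] and, for leaves that use one, the subleaf in info[2]; the array is overwritten with EAX/EBX/ECX/EDX. The pattern, as get_cpu_features uses it below:

	sljit_u32 info[4];

	info[0] = 7;	/* leaf 7: structured extended feature flags */
	info[2] = 0;	/* subleaf 0 */
	execute_cpu_id(info);
	/* info[0] = EAX, info[1] = EBX, info[2] = ECX, info[3] = EDX */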
 
+static void get_cpu_features(void)
+{
+	sljit_u32 feature_list = CPU_FEATURE_DETECTED;
+	sljit_u32 info[4];
+	sljit_u32 max_id;
+
+	info[0] = 0;
+	execute_cpu_id(info);
+	max_id = info[0];
+
+	if (max_id >= 7) {
+		info[0] = 7;
+		info[2] = 0;
+		execute_cpu_id(info);
+
+		if (info[1] & 0x8)
+			feature_list |= CPU_FEATURE_TZCNT;
+		if (info[1] & 0x20)
+			feature_list |= CPU_FEATURE_AVX2;
+	}
+
+	if (max_id >= 1) {
+		info[0] = 1;
+		execute_cpu_id(info);
+
+		if (info[2] & 0x80000)
+			feature_list |= CPU_FEATURE_SSE41;
+		if (info[2] & 0x10000000)
+			feature_list |= CPU_FEATURE_AVX;
 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
-	if (value_edx & 0x4000000)
-		feature_list |= CPU_FEATURE_SSE2;
+		if (info[3] & 0x4000000)
+			feature_list |= CPU_FEATURE_SSE2;
 #endif
-	if (value_ecx & 0x80000)
-		feature_list |= CPU_FEATURE_SSE41;
-	if (value_edx & 0x8000)
-		feature_list |= CPU_FEATURE_CMOV;
+		if (info[3] & 0x8000)
+			feature_list |= CPU_FEATURE_CMOV;
+	}
+
+	info[0] = 0x80000001;
+	info[2] = 0; /* Silences an incorrect compiler warning. */
+	execute_cpu_id(info);
+
+	if (info[2] & 0x20)
+		feature_list |= CPU_FEATURE_LZCNT;
 
 	cpu_feature_list = feature_list;
 }
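For reference, the bits tested above are the documented CPUID feature flags:

	/* leaf 7, EBX:          bit 3  (0x8)         BMI1 (provides TZCNT)
	                         bit 5  (0x20)        AVX2
	   leaf 1, ECX:          bit 19 (0x80000)     SSE4.1
	                         bit 28 (0x10000000)  AVX
	   leaf 1, EDX:          bit 15 (0x8000)      CMOV
	                         bit 26 (0x4000000)   SSE2
	   leaf 0x80000001, ECX: bit 5  (0x20)        LZCNT (ABM) */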
@@ -586,7 +614,7 @@
 		label_addr = jump->u.target - (sljit_uw)executable_offset;
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
+	if ((sljit_sw)(label_addr - (jump->addr + 2)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 6)) < HALFWORD_MIN)
 		return generate_far_jump_code(jump, code_ptr);
 #endif
 
@@ -782,7 +810,7 @@
 	switch (feature_type) {
 	case SLJIT_HAS_FPU:
 #ifdef SLJIT_IS_FPU_AVAILABLE
-		return SLJIT_IS_FPU_AVAILABLE;
+		return (SLJIT_IS_FPU_AVAILABLE) != 0;
 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
 		if (cpu_feature_list == 0)
 			get_cpu_features();
@@ -818,18 +846,23 @@
 	case SLJIT_HAS_PREFETCH:
 	case SLJIT_HAS_COPY_F32:
 	case SLJIT_HAS_COPY_F64:
+	case SLJIT_HAS_ATOMIC:
 		return 1;
 
-	case SLJIT_HAS_SSE2:
-	case SLJIT_HAS_SIMD:
-#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
+#if !(defined SLJIT_IS_FPU_AVAILABLE) || SLJIT_IS_FPU_AVAILABLE
+	case SLJIT_HAS_AVX:
 		if (cpu_feature_list == 0)
 			get_cpu_features();
-		return (cpu_feature_list & CPU_FEATURE_SSE2) != 0;
-#else /* !SLJIT_DETECT_SSE2 */
-		return 1;
-#endif /* SLJIT_DETECT_SSE2 */
-
+		return (cpu_feature_list & CPU_FEATURE_AVX) != 0;
+	case SLJIT_HAS_AVX2:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+		return (cpu_feature_list & CPU_FEATURE_AVX2) != 0;
+	case SLJIT_HAS_SIMD:
+		if (cpu_feature_list == 0)
+			get_cpu_features();
+		return (cpu_feature_list & CPU_FEATURE_SSE41) != 0;
+#endif /* SLJIT_IS_FPU_AVAILABLE */
 	default:
 		return 0;
 	}
@@ -903,8 +936,13 @@
 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
 
-static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
-	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w);
+static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
+	sljit_u8 opcode, sljit_uw pref,
+	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
+
+static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
+	sljit_u8 opcode, sljit_uw pref,
+	sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
 
 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);
@@ -1078,7 +1116,8 @@
 		*inst = MOV_rm_r;
 		return SLJIT_SUCCESS;
 	}
-	if (src & SLJIT_IMM) {
+
+	if (src == SLJIT_IMM) {
 		if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
@@ -1287,7 +1326,7 @@
 	compiler->mode32 = 0;
 #endif
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
@@ -1316,47 +1355,27 @@
 #else
 		dst_r = src;
 #endif
-	}
+	} else {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
-		/* src, dst are registers. */
-		SLJIT_ASSERT(FAST_IS_REG(dst));
-		if (reg_map[dst] < 4) {
-			if (dst != src)
-				EMIT_MOV(compiler, dst, 0, src, 0);
-			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
-		}
-		else {
-			if (dst != src)
-				EMIT_MOV(compiler, dst, 0, src, 0);
-			if (sign) {
-				/* shl reg, 24 */
-				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
-				FAIL_IF(!inst);
-				*inst |= SHL;
-				/* sar reg, 24 */
-				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
-				FAIL_IF(!inst);
-				*inst |= SAR;
-			}
-			else {
+		if (FAST_IS_REG(src) && reg_map[src] >= 4) {
+			/* Both src and dst are registers. */
+			SLJIT_ASSERT(FAST_IS_REG(dst));
+
+			if (src == dst && !sign) {
 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
 				FAIL_IF(!inst);
 				*(inst + 1) |= AND;
+				return SLJIT_SUCCESS;
 			}
+
+			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
+			src = TMP_REG1;
+			srcw = 0;
 		}
-		return SLJIT_SUCCESS;
-	}
-#endif
-	else {
+#endif /* SLJIT_CONFIG_X86_32 */
+
 		/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
+		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, 0, dst_r, src, srcw));
 	}
 
 	if (dst & SLJIT_MEM) {
@@ -1403,7 +1422,7 @@
 	compiler->mode32 = 0;
 #endif
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 		if (FAST_IS_REG(dst)) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 			return emit_do_imm(compiler, MOV_r_i32 | reg_map[dst], srcw);
@@ -1424,12 +1443,8 @@
 
 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
 		dst_r = src;
-	else {
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
-	}
+	else
+		FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, 0, dst_r, src, srcw));
 
 	if (dst & SLJIT_MEM) {
 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
@@ -1491,20 +1506,14 @@
 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
 	if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_F3, dst_r, 0, src, srcw);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = is_clz ? LZCNT_r_rm : TZCNT_r_rm;
+		FAIL_IF(emit_groupf(compiler, is_clz ? LZCNT_r_rm : TZCNT_r_rm, EX86_PREF_F3, dst_r, src, srcw));
 
 		if (dst & SLJIT_MEM)
 			EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 		return SLJIT_SUCCESS;
 	}
 
-	inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = is_clz ? BSR_r_rm : BSF_r_rm;
+	FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, 0, dst_r, src, srcw));
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	max = is_clz ? (32 + 31) : 32;
@@ -1537,13 +1546,8 @@
 
 	if (cpu_feature_list & CPU_FEATURE_CMOV) {
 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
-
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = CMOVE_r_rm;
-	}
-	else
+		FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, 0, dst_r, TMP_REG2, 0));
+	} else
 		FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
 
 	if (is_clz) {
@@ -1624,9 +1628,9 @@
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, (sljit_sw)size, dst_r, 0);
 		FAIL_IF(!inst);
 		if (op == SLJIT_REV_U16)
-			*inst |= SHR;
+			inst[1] |= SHR;
 		else
-			*inst |= SAR;
+			inst[1] |= SAR;
 	}
 
 	if (dst & SLJIT_MEM) {
@@ -1691,14 +1695,14 @@
 				if (op == SLJIT_MOV_S32)
 					op = SLJIT_MOV_U32;
 			}
-			else if (src & SLJIT_IMM) {
+			else if (src == SLJIT_IMM) {
 				if (op == SLJIT_MOV_U32)
 					op = SLJIT_MOV_S32;
 			}
 		}
 #endif /* SLJIT_CONFIG_X86_64 */
 
-		if (src & SLJIT_IMM) {
+		if (src == SLJIT_IMM) {
 			switch (op) {
 			case SLJIT_MOV_U8:
 				srcw = (sljit_u8)srcw;
@@ -1810,7 +1814,7 @@
 	sljit_u8 op_imm = U8(op_types & 0xff);
 
 	if (dst == src1 && dstw == src1w) {
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
@@ -1844,7 +1848,7 @@
 
 	/* Only for cumulative operations. */
 	if (dst == src2 && dstw == src2w) {
-		if (src1 & SLJIT_IMM) {
+		if (src1 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
 #else
@@ -1878,7 +1882,7 @@
 	/* General version. */
 	if (FAST_IS_REG(dst)) {
 		EMIT_MOV(compiler, dst, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
 		}
 		else {
@@ -1890,7 +1894,7 @@
 	else {
 		/* This version requires less memory writing. */
 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
 		}
 		else {
@@ -1917,7 +1921,7 @@
 	sljit_u8 op_imm = U8(op_types & 0xff);
 
 	if (dst == src1 && dstw == src1w) {
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
@@ -1951,7 +1955,7 @@
 	/* General version. */
 	if (FAST_IS_REG(dst) && dst != src2) {
 		EMIT_MOV(compiler, dst, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
 		}
 		else {
@@ -1963,7 +1967,7 @@
 	else {
 		/* This version requires less memory writing. */
 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
 		}
 		else {
@@ -1986,20 +1990,12 @@
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
 
 	/* Register destination. */
-	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = IMUL_r_rm;
-	}
-	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = IMUL_r_rm;
-	}
-	else if (src1 & SLJIT_IMM) {
-		if (src2 & SLJIT_IMM) {
+	if (dst_r == src1 && src2 != SLJIT_IMM) {
+		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src2, src2w));
+	} else if (dst_r == src2 && src1 != SLJIT_IMM) {
+		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src1, src1w));
+	} else if (src1 == SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
 			src2 = dst_r;
 			src2w = 0;
@@ -2036,14 +2032,11 @@
 			if (dst_r != src2)
 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
-			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = IMUL_r_rm;
+			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, TMP_REG2, 0));
 		}
 #endif
 	}
-	else if (src2 & SLJIT_IMM) {
+	else if (src2 == SLJIT_IMM) {
 		/* Note: src1 is NOT immediate. */
 
 		if (src2w <= 127 && src2w >= -128) {
@@ -2078,11 +2071,7 @@
 			if (dst_r != src1)
 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
-
-			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = IMUL_r_rm;
+			FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, TMP_REG2, 0));
 		}
 #endif
 	} else {
@@ -2090,10 +2079,7 @@
 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
 			dst_r = TMP_REG1;
 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
-		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = IMUL_r_rm;
+		FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src2, src2w));
 	}
 
 	if (dst & SLJIT_MEM)
@@ -2126,10 +2112,10 @@
 			done = 1;
 		}
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
+		if (src2 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src2w))) {
 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
 #else
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
 #endif
 			FAIL_IF(!inst);
@@ -2139,10 +2125,10 @@
 	}
 	else if (FAST_IS_REG(src2)) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
+		if (src1 == SLJIT_IMM && (compiler->mode32 || IS_HALFWORD(src1w))) {
 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
 #else
-		if (src1 & SLJIT_IMM) {
+		if (src1 == SLJIT_IMM) {
 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
 #endif
 			FAIL_IF(!inst);
@@ -2166,16 +2152,16 @@
 	sljit_u8* inst;
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
+	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
-	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
+	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
 #endif
 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
 		return SLJIT_SUCCESS;
 	}
 
 	if (FAST_IS_REG(src1)) {
-		if (src2 & SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
 		}
 		else {
@@ -2186,15 +2172,15 @@
 		return SLJIT_SUCCESS;
 	}
 
-	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
+	if (FAST_IS_REG(src2) && src1 != SLJIT_IMM) {
 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
 		FAIL_IF(!inst);
 		*inst = CMP_rm_r;
 		return SLJIT_SUCCESS;
 	}
 
-	if (src2 & SLJIT_IMM) {
-		if (src1 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
+		if (src1 == SLJIT_IMM) {
 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
 			src1 = TMP_REG1;
 			src1w = 0;
@@ -2217,25 +2203,25 @@
 	sljit_u8* inst;
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
+	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
 #else
-	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
+	if (src1 == SLJIT_R0 && src2 == SLJIT_IMM && (src2w > 127 || src2w < -128)) {
 #endif
 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
 		return SLJIT_SUCCESS;
 	}
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
+	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
 #else
-	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
+	if (src2 == SLJIT_R0 && src1 == SLJIT_IMM && (src1w > 127 || src1w < -128)) {
 #endif
 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
 		return SLJIT_SUCCESS;
 	}
 
-	if (!(src1 & SLJIT_IMM)) {
-		if (src2 & SLJIT_IMM) {
+	if (src1 != SLJIT_IMM) {
+		if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if (IS_HALFWORD(src2w) || compiler->mode32) {
 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
@@ -2263,8 +2249,8 @@
 		}
 	}
 
-	if (!(src2 & SLJIT_IMM)) {
-		if (src1 & SLJIT_IMM) {
+	if (src2 != SLJIT_IMM) {
+		if (src1 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			if (IS_HALFWORD(src1w) || compiler->mode32) {
 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
@@ -2293,7 +2279,7 @@
 	}
 
 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
-	if (src2 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		if (IS_HALFWORD(src2w) || compiler->mode32) {
 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
@@ -2331,18 +2317,18 @@
 #endif
 	sljit_u8* inst;
 
-	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
+	if (src2 == SLJIT_IMM || src2 == SLJIT_PREF_SHIFT_REG) {
 		if (dst == src1 && dstw == src1w) {
 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
 			FAIL_IF(!inst);
-			*inst |= mode;
+			inst[1] |= mode;
 			return SLJIT_SUCCESS;
 		}
 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 			FAIL_IF(!inst);
-			*inst |= mode;
+			inst[1] |= mode;
 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 			return SLJIT_SUCCESS;
 		}
@@ -2350,14 +2336,14 @@
 			EMIT_MOV(compiler, dst, 0, src1, src1w);
 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
 			FAIL_IF(!inst);
-			*inst |= mode;
+			inst[1] |= mode;
 			return SLJIT_SUCCESS;
 		}
 
 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
 		FAIL_IF(!inst);
-		*inst |= mode;
+		inst[1] |= mode;
 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
 		return SLJIT_SUCCESS;
 	}
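
The *inst |= mode rewrites into inst[1] |= mode line up with emit_x86_instruction() now returning a pointer to the opcode byte: for the Group-2 shifts the operation selector lives in the reg field of the ModRM byte, one byte after the opcode. The standard encodings being patched, as a reference sketch:

    /* d3 /r      shift r/m by cl
       c1 /r ib   shift r/m by imm8
       The ModRM reg field picks the operation: /4 = shl, /5 = shr, /7 = sar,
       which is what the pre-shifted SHL/SHR/SAR mode constants OR in. */
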
@@ -2367,7 +2353,7 @@
 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 		FAIL_IF(!inst);
-		*inst |= mode;
+		inst[1] |= mode;
 		return emit_mov(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 	}
 
@@ -2385,7 +2371,7 @@
 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
 		FAIL_IF(!inst);
-		*inst |= mode;
+		inst[1] |= mode;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		compiler->mode32 = 0;
 #endif
@@ -2411,7 +2397,7 @@
 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
 	inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
 	FAIL_IF(!inst);
-	*inst |= mode;
+	inst[1] |= mode;
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
@@ -2434,7 +2420,7 @@
 	sljit_s32 src2, sljit_sw src2w)
 {
 	/* The CPU does not set flags if the shift count is 0. */
-	if (src2 & SLJIT_IMM) {
+	if (src2 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		src2w &= compiler->mode32 ? 0x1f : 0x3f;
 #else /* !SLJIT_CONFIG_X86_64 */
@@ -2499,7 +2485,7 @@
 			return emit_unary(compiler, NEG_rm, dst, dstw, src2, src2w);
 
 		if (!HAS_FLAGS(op)) {
-			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
+			if (src2 == SLJIT_IMM && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
 				return compiler->error;
 			if (FAST_IS_REG(dst) && src2 == dst) {
 				FAIL_IF(emit_non_cum_binary(compiler, BINARY_OPCODE(SUB), dst, 0, dst, 0, src1, src1w));
@@ -2522,9 +2508,9 @@
 			dst, dstw, src1, src1w, src2, src2w);
 	case SLJIT_XOR:
 		if (!HAS_FLAGS(op)) {
-			if ((src2 & SLJIT_IMM) && src2w == -1)
+			if (src2 == SLJIT_IMM && src2w == -1)
 				return emit_unary(compiler, NOT_rm, dst, dstw, src1, src1w);
-			if ((src1 & SLJIT_IMM) && src1w == -1)
+			if (src1 == SLJIT_IMM && src1w == -1)
 				return emit_unary(compiler, NOT_rm, dst, dstw, src2, src2w);
 		}
 
@@ -2610,7 +2596,7 @@
 	compiler->mode32 = op & SLJIT_32;
 #endif /* SLJIT_CONFIG_X86_64 */
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 		src3w &= 0x1f;
 #else /* !SLJIT_CONFIG_X86_32 */
@@ -2637,7 +2623,7 @@
 	}
 #endif /* SLJIT_CONFIG_X86_32 */
 
-	if (dst_reg == SLJIT_PREF_SHIFT_REG && !(src3 & SLJIT_IMM) && (src3 != SLJIT_PREF_SHIFT_REG || src1_reg != SLJIT_PREF_SHIFT_REG)) {
+	if (dst_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && (src3 != SLJIT_PREF_SHIFT_REG || src1_reg != SLJIT_PREF_SHIFT_REG)) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		EMIT_MOV(compiler, TMP_REG1, 0, src1_reg, src1w);
 		src1_reg = TMP_REG1;
@@ -2662,7 +2648,7 @@
 		if (src3 != SLJIT_PREF_SHIFT_REG)
 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src3, src3w);
 	} else {
-		if (src2_reg == SLJIT_PREF_SHIFT_REG && !(src3 & SLJIT_IMM) && src3 != SLJIT_PREF_SHIFT_REG) {
+		if (src2_reg == SLJIT_PREF_SHIFT_REG && src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 			compiler->mode32 = 0;
 #endif /* SLJIT_CONFIG_X86_64 */
@@ -2710,7 +2696,7 @@
 		}
 #endif /* SLJIT_CONFIG_X86_64 */
 
-		if (!(src3 & SLJIT_IMM) && src3 != SLJIT_PREF_SHIFT_REG) {
+		if (src3 != SLJIT_IMM && src3 != SLJIT_PREF_SHIFT_REG) {
 			if (!restore_ecx) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 				compiler->mode32 = 0;
@@ -2741,7 +2727,7 @@
 	FAIL_IF(!inst);
 	inst[0] = GROUP_0F;
 
-	if (src3 & SLJIT_IMM) {
+	if (src3 == SLJIT_IMM) {
 		inst[1] = U8((is_left ? SHLD : SHRD) - 1);
 
 		/* Immediate argument is added separately. */
@@ -2823,7 +2809,7 @@
 {
 	CHECK_REG_INDEX(check_sljit_get_register_index(type, reg));
 
-	if (type == SLJIT_INT_REGISTER) {
+	if (type == SLJIT_GP_REGISTER) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 		if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
 			return -1;
@@ -2881,40 +2867,43 @@
 	sse2_buffer[13] = 0x7fffffff;
 }
 
-static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
-	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
+static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
+	sljit_u8 opcode, sljit_uw pref,
+	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
 {
-	sljit_u8 *inst;
-
-	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
+	sljit_u8 *inst = emit_x86_instruction(compiler, 2 | pref, dst, 0, src, srcw);
 	FAIL_IF(!inst);
 	inst[0] = GROUP_0F;
 	inst[1] = opcode;
 	return SLJIT_SUCCESS;
 }
 
-static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
-	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
+static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
+	sljit_u8 opcode, sljit_uw pref,
+	sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
 {
 	sljit_u8 *inst;
 
-	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
+	SLJIT_ASSERT((pref & EX86_SSE2) && ((pref & VEX_OP_0F38) || (pref & VEX_OP_0F3A)));
+
+	inst = emit_x86_instruction(compiler, 3 | (pref & ~(VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw);
 	FAIL_IF(!inst);
 	inst[0] = GROUP_0F;
-	inst[1] = opcode;
+	inst[1] = U8((pref & VEX_OP_0F38) ? 0x38 : 0x3A);
+	inst[2] = opcode;
 	return SLJIT_SUCCESS;
 }
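
emit_groupf() and emit_groupf_ext() replace the old emit_sse2/emit_sse2_logic pair, split by opcode map: the former emits two-byte 0F-map instructions, the latter the three-byte 0F 38 and 0F 3A maps selected by the VEX_OP_0F38/VEX_OP_0F3A bits in pref. Representative encodings (standard SSE, for illustration):

    /* 0F map          : 66 0F 6F /r        movdqa xmm, xmm/m128   (emit_groupf)
       0F 38 map       : 66 0F 38 00 /r     pshufb xmm, xmm/m128   (emit_groupf_ext)
       0F 3A map + imm : 66 0F 3A 21 /r ib  insertps xmm, xmm/m32  (emit_groupf_ext) */
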
 
 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
 {
-	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
+	return emit_groupf(compiler, MOVSD_x_xm, (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw);
 }
 
 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
 {
-	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
+	return emit_groupf(compiler, MOVSD_xm_x, (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw);
 }
 
 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
@@ -2922,7 +2911,6 @@
 	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_s32 dst_r;
-	sljit_u8 *inst;
 
 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
@@ -2932,10 +2920,7 @@
 		compiler->mode32 = 0;
 #endif
 
-	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = CVTTSD2SI_r_xm;
+	FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw));
 
 	if (dst & SLJIT_MEM)
 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
@@ -2947,7 +2932,6 @@
 	sljit_s32 src, sljit_sw srcw)
 {
 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
-	sljit_u8 *inst;
 
 	CHECK_EXTRA_REGS(src, srcw, (void)0);
 
@@ -2956,7 +2940,7 @@
 		compiler->mode32 = 0;
 #endif
 
-	if (src & SLJIT_IMM) {
+	if (src == SLJIT_IMM) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
 			srcw = (sljit_s32)srcw;
@@ -2966,10 +2950,7 @@
 		srcw = 0;
 	}
 
-	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_32) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = CVTSI2SD_x_rm;
+	FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw));
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	compiler->mode32 = 1;
@@ -2987,7 +2968,7 @@
 	case SLJIT_ORDERED_EQUAL:
 		/* Also: SLJIT_UNORDERED_OR_NOT_EQUAL */
 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
-		FAIL_IF(emit_sse2(compiler, CMPS_x_xm, op & SLJIT_32, TMP_FREG, src2, src2w));
+		FAIL_IF(emit_groupf(compiler, CMPS_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w));
 
 		/* EQ */
 		FAIL_IF(emit_byte(compiler, 0));
@@ -3005,7 +2986,7 @@
 			src2 = TMP_FREG;
 		}
 
-		return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_32), src2, src1, src1w);
+		return emit_groupf(compiler, UCOMISD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w);
 	}
 
 	if (!FAST_IS_REG(src1)) {
@@ -3013,7 +2994,7 @@
 		src1 = TMP_FREG;
 	}
 
-	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_32), src1, src2, src2w);
+	return emit_groupf(compiler, UCOMISD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w);
 }
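
The EX86_SELECT_F2_F3(op) and EX86_SELECT_66(op) helpers used above pick the SSE prefix from the SLJIT_32 bit. The mapping being relied on, sketched with standard encodings:

    /* scalar ops :  F3 0F xx = ...SS (f32)     F2 0F xx = ...SD (f64)
       packed ops :     0F xx = ...PS (f32)     66 0F xx = ...PD (f64)
       e.g. ucomiss = 0F 2E, ucomisd = 66 0F 2E */
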
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
@@ -3044,14 +3025,13 @@
 			/* We overwrite the high bits of source. From SLJIT point of view,
 			   this is not an issue.
 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
-			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_32, src, src, 0));
-		}
-		else {
+			FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm, ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0));
+		} else {
 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_32), TMP_FREG, src, srcw));
 			src = TMP_FREG;
 		}
 
-		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_32, dst_r, src, 0));
+		FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm, ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0));
 		if (dst_r == TMP_FREG)
 			return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
 		return SLJIT_SUCCESS;
@@ -3069,11 +3049,11 @@
 
 	switch (GET_OPCODE(op)) {
 	case SLJIT_NEG_F64:
-		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, !(op & SLJIT_32), dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
+		FAIL_IF(emit_groupf(compiler, XORPD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
 		break;
 
 	case SLJIT_ABS_F64:
-		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, !(op & SLJIT_32), dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12)));
+		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12)));
 		break;
 	}
 
@@ -3122,19 +3102,19 @@
 
 	switch (GET_OPCODE(op)) {
 	case SLJIT_ADD_F64:
-		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_32, dst_r, src2, src2w));
+		FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
 		break;
 
 	case SLJIT_SUB_F64:
-		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_32, dst_r, src2, src2w));
+		FAIL_IF(emit_groupf(compiler, SUBSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
 		break;
 
 	case SLJIT_MUL_F64:
-		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_32, dst_r, src2, src2w));
+		FAIL_IF(emit_groupf(compiler, MULSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
 		break;
 
 	case SLJIT_DIV_F64:
-		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_32, dst_r, src2, src2w));
+		FAIL_IF(emit_groupf(compiler, DIVSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
 		break;
 	}
 
@@ -3148,6 +3128,8 @@
 	sljit_s32 src1, sljit_sw src1w,
 	sljit_s32 src2, sljit_sw src2w)
 {
+	sljit_uw pref;
+
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_fop2r(compiler, op, dst_freg, src1, src1w, src2, src2w));
 	ADJUST_LOCAL_OFFSET(src1, src1w);
@@ -3159,9 +3141,10 @@
 
 	if (dst_freg == src1) {
 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
-		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, !(op & SLJIT_32), TMP_FREG, src1, src1w));
-		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, !(op & SLJIT_32), TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
-		return emit_sse2_logic(compiler, XORPD_x_xm, !(op & SLJIT_32), dst_freg, TMP_FREG, 0);
+		pref = EX86_SELECT_66(op) | EX86_SSE2;
+		FAIL_IF(emit_groupf(compiler, XORPD_x_xm, pref, TMP_FREG, src1, src1w));
+		FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
+		return emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, TMP_FREG, 0);
 	}
 
 	if (src1 & SLJIT_MEM) {
@@ -3173,9 +3156,10 @@
 	if (dst_freg != src2)
 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_freg, src2, src2w));
 
-	FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, !(op & SLJIT_32), dst_freg, src1, src1w));
-	FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, !(op & SLJIT_32), dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
-	return emit_sse2_logic(compiler, XORPD_x_xm, !(op & SLJIT_32), dst_freg, src1, src1w);
+	pref = EX86_SELECT_66(op) | EX86_SSE2;
+	FAIL_IF(emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, src1, src1w));
+	FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
+	return emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, src1, src1w);
 }
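
For reference, the xor/and/xor chain in sljit_emit_fop2r computes a copysign-style result; with the sse2_buffer constant acting as the sign-bit mask, it is equivalent to this sketch:

    /* t    = src2 ^ src1;      XORPD: bits where the operands differ
       t   &= SIGN_MASK;        ANDPD: keep only the sign-bit difference
       dst  = src1 ^ t;         XORPD: src1's magnitude with src2's sign */
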
 
 /* --------------------------------------------------------------------- */
@@ -3406,7 +3390,6 @@
 	sljit_sw dstw = 0;
 #endif /* SLJIT_CONFIG_X86_32 */
 	sljit_sw src2w = 0;
-	sljit_u8* inst;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_select(compiler, type, dst_reg, src1, src1w, src2_reg));
@@ -3424,7 +3407,7 @@
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	if (dst & SLJIT_MEM) {
-		if ((src1 & SLJIT_IMM) || (!(src1 & SLJIT_MEM) && (src2_reg & SLJIT_MEM))) {
+		if (src1 == SLJIT_IMM || (!(src1 & SLJIT_MEM) && (src2_reg & SLJIT_MEM))) {
 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
 			src1 = src2_reg;
 			src1w = src2w;
@@ -3451,7 +3434,7 @@
 			}
 		}
 
-		if (SLJIT_UNLIKELY(src1 & SLJIT_IMM)) {
+		if (SLJIT_UNLIKELY(src1 == SLJIT_IMM)) {
 			SLJIT_ASSERT(dst_reg != TMP_REG1);
 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
 			src1 = TMP_REG1;
@@ -3461,14 +3444,10 @@
 	}
 #endif /* SLJIT_CONFIG_X86_32 */
 
-	if (sljit_has_cpu_feature(SLJIT_HAS_CMOV)) {
-		inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src1, src1w);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = U8(get_jump_code((sljit_uw)type) - 0x40);
-	} else {
+	if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
+		FAIL_IF(emit_groupf(compiler, U8(get_jump_code((sljit_uw)type) - 0x40), 0, dst_reg, src1, src1w));
+	else
 		FAIL_IF(emit_cmov_generic(compiler, type, dst_reg, src1, src1w));
-	}
 
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	if (dst_reg == TMP_REG1)
@@ -3522,9 +3501,8 @@
 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
 	sljit_s32 alignment = SLJIT_SIMD_GET_ALIGNMENT(type);
-	sljit_u8 *inst;
 	sljit_u8 opcode = 0;
-	sljit_uw pref = 2 | EX86_SSE2;
+	sljit_uw pref;
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw));
@@ -3535,41 +3513,47 @@
 	compiler->mode32 = 1;
 #endif /* SLJIT_CONFIG_X86_64 */
 
-	if (reg_size == 4) {
-		if (!(srcdst & SLJIT_MEM))
-			alignment = 4;
-
-		if (type & SLJIT_SIMD_FLOAT) {
-			if (elem_size == 2 || elem_size == 3) {
-				opcode = alignment >= 4 ? MOVAPS_x_xm : MOVUPS_x_xm;
-
-				if (elem_size == 3)
-					pref |= EX86_PREF_66;
-
-				if (type & SLJIT_SIMD_STORE)
-					opcode = U8(opcode + 1);
-			}
-		} else {
-			opcode = (type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm;
-			pref |= alignment >= 4 ? EX86_PREF_66 : EX86_PREF_F3;
-		}
-
-		if (opcode == 0)
+	switch (reg_size) {
+	case 4:
+		pref = 2 | EX86_SSE2;
+		break;
+	case 5:
+		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
 			return SLJIT_ERR_UNSUPPORTED;
-
-		if (type & SLJIT_SIMD_TEST)
-			return SLJIT_SUCCESS;
-
-		inst = emit_x86_instruction(compiler, pref, freg, 0, srcdst, srcdstw);
-		FAIL_IF(!inst);
-
-		inst[0] = GROUP_0F;
-		inst[1] = opcode;
-		return SLJIT_SUCCESS;
+		pref = EX86_SSE2 | VEX_256;
+		break;
+	default:
+		return SLJIT_ERR_UNSUPPORTED;
 	}
 
-	/* TODO: Support VEX prefix and longer reg types. */
-	return SLJIT_ERR_UNSUPPORTED;
+	if (!(srcdst & SLJIT_MEM))
+		alignment = reg_size;
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		if (elem_size == 2 || elem_size == 3) {
+			opcode = alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm;
+
+			if (elem_size == 3)
+				pref |= EX86_PREF_66;
+
+			if (type & SLJIT_SIMD_STORE)
+				opcode = U8(opcode + 1);
+		}
+	} else {
+		opcode = (type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm;
+		pref |= alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3;
+	}
+
+	if (opcode == 0)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (pref & VEX_256)
+		return emit_vex_instruction(compiler, opcode | pref, freg, 0, srcdst, srcdstw);
+
+	return emit_groupf(compiler, opcode, pref, freg, srcdst, srcdstw);
 }
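
sljit_emit_simd_mov now shares one path for XMM and YMM: reg_size is the log2 byte width (4 = 16-byte XMM, 5 = 32-byte YMM via VEX_256, gated on AVX2), register-to-register copies count as fully aligned (alignment = reg_size), and the opcode/prefix pair falls out as follows (standard load encodings, listed for orientation; stores use the paired _xm_x forms):

    /* float, aligned   : (66) 0F 28 /r   movaps / movapd
       float, unaligned : (66) 0F 10 /r   movups / movupd
       int,   aligned   :  66  0F 6F /r   movdqa
       int,   unaligned :  F3  0F 6F /r   movdqu */
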
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type,
@@ -3591,168 +3575,188 @@
 		CHECK_EXTRA_REGS(src, srcw, (void)0);
 	}
 
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : (elem_size > 2))
+		return SLJIT_ERR_UNSUPPORTED;
+#else /* !SLJIT_CONFIG_X86_32 */
 	compiler->mode32 = 1;
-#endif /* SLJIT_CONFIG_X86_64 */
 
-	if (reg_size == 4) {
-		if (type & SLJIT_SIMD_FLOAT) {
-			if (elem_size < 2 || elem_size > 3)
-				return SLJIT_ERR_UNSUPPORTED;
+	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
+		return SLJIT_ERR_UNSUPPORTED;
+#endif /* SLJIT_CONFIG_X86_32 */
 
+	if (cpu_feature_list & CPU_FEATURE_AVX2) {
+		if (reg_size < 4 || reg_size > 5)
+			return SLJIT_ERR_UNSUPPORTED;
+
+		if (src != SLJIT_IMM && (reg_size == 5 || elem_size < 3 || !(type & SLJIT_SIMD_FLOAT))) {
 			if (type & SLJIT_SIMD_TEST)
 				return SLJIT_SUCCESS;
 
-			if (src & SLJIT_IMM) {
-				inst = emit_x86_instruction(compiler, 2 | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, 0, freg, 0);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = XORPD_x_xm;
-				return SLJIT_SUCCESS;
-			}
-
-			if (elem_size == 2 && freg != src) {
-				FAIL_IF(emit_sse2_load(compiler, 1, freg, src, srcw));
+			if (!(src & SLJIT_MEM) && !(type & SLJIT_SIMD_FLOAT)) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+				if (elem_size >= 3)
+					compiler->mode32 = 0;
+#endif /* SLJIT_CONFIG_X86_64 */
+				FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, src, srcw));
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+				compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
 				src = freg;
 				srcw = 0;
 			}
 
-			inst = emit_x86_instruction(compiler, 2 | (elem_size == 3 ? EX86_PREF_F2 : 0) | EX86_SSE2, freg, 0, src, srcw);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm;
-
-			if (elem_size == 2)
-				return emit_byte(compiler, 0);
-			return SLJIT_SUCCESS;
-		}
-
-		if (src & SLJIT_IMM) {
-			if (elem_size == 0) {
-				srcw = (sljit_u8)srcw;
-				srcw |= srcw << 8;
-				srcw |= srcw << 16;
-				elem_size = 2;
-			} else if (elem_size == 1) {
-				srcw = (sljit_u16)srcw;
-				srcw |= srcw << 16;
-				elem_size = 2;
-			}
-
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-			if (elem_size == 2 && (sljit_s32)srcw == -1)
-				srcw = -1;
-#endif /* SLJIT_CONFIG_X86_64 */
-
-			if (srcw == 0 || srcw == -1) {
-				inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = srcw == 0 ? PXOR_x_xm : PCMPEQB_x_xm;
-				return SLJIT_SUCCESS;
-			}
-
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-			if (elem_size == 3)
-				FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
-			else {
-#endif /* SLJIT_CONFIG_X86_64 */
-				EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
-				src = TMP_REG1;
-				srcw = 0;
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-			}
-#endif /* SLJIT_CONFIG_X86_64 */
-		}
-
+			switch (elem_size) {
+			case 0:
+				size = VPBROADCASTB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+				break;
+			case 1:
+				size = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+				break;
+			case 2:
+				size = ((type & SLJIT_SIMD_FLOAT) ? VBROADCASTSS_x_xm : VPBROADCASTD_x_xm) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+				break;
+			default:
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-		if (elem_size > 2)
-			return SLJIT_ERR_UNSUPPORTED;
+				size = VBROADCASTSD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+#else /* !SLJIT_CONFIG_X86_32 */
+				size = ((type & SLJIT_SIMD_FLOAT) ? VBROADCASTSD_x_xm : VPBROADCASTQ_x_xm) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
 #endif /* SLJIT_CONFIG_X86_32 */
-
-		if (type & SLJIT_SIMD_TEST)
-			return SLJIT_SUCCESS;
-
-		size = 2;
-		opcode = MOVD_x_rm;
-
-		switch (elem_size) {
-		case 0:
-			if (!FAST_IS_REG(src)) {
-				opcode = 0x3a /* Prefix of PINSRB_x_rm_i8. */;
-				size = 3;
+				break;
 			}
-			break;
-		case 1:
-			if (!FAST_IS_REG(src))
-				opcode = PINSRW_x_rm_i8;
-			break;
-		case 2:
-			break;
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		case 3:
-			/* MOVQ */
-			compiler->mode32 = 0;
-			break;
-#endif /* SLJIT_CONFIG_X86_64 */
+
+			if (reg_size == 5)
+				size |= VEX_256;
+
+			return emit_vex_instruction(compiler, size, freg, 0, src, srcw);
+		}
+	} else if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		if (src == SLJIT_IMM) {
+			if (reg_size == 5)
+				return emit_vex_instruction(compiler, XORPD_x_xm | VEX_256 | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
+
+			return emit_groupf(compiler, XORPD_x_xm, (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0);
 		}
 
-		inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = opcode;
-
-		if (size == 3) {
-			SLJIT_ASSERT(opcode == 0x3a);
-			inst[2] = PINSRB_x_rm_i8;
+		if (elem_size == 2 && freg != src) {
+			FAIL_IF(emit_sse2_load(compiler, 1, freg, src, srcw));
+			src = freg;
+			srcw = 0;
 		}
 
-		if (opcode != MOVD_x_rm)
-			FAIL_IF(emit_byte(compiler, 0));
+		FAIL_IF(emit_groupf(compiler, elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm, (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2, freg, src, srcw));
 
-		switch (elem_size) {
-		case 0:
-			inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, TMP_FREG, 0, TMP_FREG, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = PXOR_x_xm;
-
-			inst = emit_x86_instruction(compiler, 3 | EX86_PREF_66 | EX86_SSE2, freg, 0, TMP_FREG, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = 0x38;
-			inst[2] = PSHUFB_x_xm;
-			return SLJIT_SUCCESS;
-		case 1:
-			inst = emit_x86_instruction(compiler, 2 | EX86_PREF_F2 | EX86_SSE2, freg, 0, freg, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = PSHUFLW_x_xm;
-
-			FAIL_IF(emit_byte(compiler, 0));
-			/* fallthrough */
-		case 2:
-			inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = PSHUFD_x_xm;
-
+		if (elem_size == 2)
 			return emit_byte(compiler, 0);
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		case 3:
-			compiler->mode32 = 1;
-			inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, freg, 0, freg, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = PSHUFD_x_xm;
-
-			return emit_byte(compiler, 0x44);
-#endif /* SLJIT_CONFIG_X86_64 */
-		}
+		return SLJIT_SUCCESS;
 	}
 
-	/* TODO: Support VEX prefix and longer reg types. */
-	return SLJIT_ERR_UNSUPPORTED;
+	if (src == SLJIT_IMM) {
+		if (elem_size == 0) {
+			srcw = (sljit_u8)srcw;
+			srcw |= srcw << 8;
+			srcw |= srcw << 16;
+			elem_size = 2;
+		} else if (elem_size == 1) {
+			srcw = (sljit_u16)srcw;
+			srcw |= srcw << 16;
+			elem_size = 2;
+		}
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		if (elem_size == 2 && (sljit_s32)srcw == -1)
+			srcw = -1;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+		if (srcw == 0 || srcw == -1) {
+			if (reg_size == 5)
+				return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQB_x_xm) | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
+
+			return emit_groupf(compiler, srcw == 0 ? PXOR_x_xm : PCMPEQB_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
+		}
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		if (elem_size == 3)
+			FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
+		else
+#endif /* SLJIT_CONFIG_X86_64 */
+			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
+
+		src = TMP_REG1;
+		srcw = 0;
+	}
+
+	size = 2;
+	opcode = MOVD_x_rm;
+
+	switch (elem_size) {
+	case 0:
+		if (!FAST_IS_REG(src)) {
+			opcode = 0x3a /* 0F 3A opcode map escape; PINSRB_x_rm_i8 follows. */;
+			size = 3;
+		}
+		break;
+	case 1:
+		if (!FAST_IS_REG(src))
+			opcode = PINSRW_x_rm_i8;
+		break;
+	case 2:
+		break;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	case 3:
+		/* MOVQ */
+		compiler->mode32 = 0;
+		break;
+#endif /* SLJIT_CONFIG_X86_64 */
+	}
+
+	inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, src, srcw);
+	FAIL_IF(!inst);
+	inst[0] = GROUP_0F;
+	inst[1] = opcode;
+
+	if (reg_size == 5) {
+		SLJIT_ASSERT(opcode == MOVD_x_rm);
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		size = VPBROADCASTD_x_xm;
+#else /* !SLJIT_CONFIG_X86_32 */
+		size = (elem_size == 3) ? VPBROADCASTQ_x_xm : VPBROADCASTD_x_xm;
+#endif /* SLJIT_CONFIG_X86_32 */
+		return emit_vex_instruction(compiler, size | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
+	}
+
+	if (size == 3) {
+		SLJIT_ASSERT(opcode == 0x3a);
+		inst[2] = PINSRB_x_rm_i8;
+	}
+
+	if (opcode != MOVD_x_rm)
+		FAIL_IF(emit_byte(compiler, 0));
+
+	switch (elem_size) {
+	case 0:
+		FAIL_IF(emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
+		return emit_groupf_ext(compiler, PSHUFB_x_xm, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
+	case 1:
+		FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, EX86_PREF_F2 | EX86_SSE2, freg, freg, 0));
+		FAIL_IF(emit_byte(compiler, 0));
+		/* fallthrough */
+	default:
+		FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
+		return emit_byte(compiler, 0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	case 3:
+		compiler->mode32 = 1;
+		FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
+		return emit_byte(compiler, 0x44);
+#endif /* SLJIT_CONFIG_X86_64 */
+	}
 }
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type,
@@ -3764,83 +3768,155 @@
 	sljit_u8 *inst;
 	sljit_u8 opcode = 0;
 	sljit_uw size;
+	sljit_s32 freg_orig = freg;
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	sljit_s32 srcdst_is_ereg = 0;
+	sljit_s32 srcdst_orig = 0;
+	sljit_sw srcdstw_orig = 0;
+#endif /* SLJIT_CONFIG_X86_32 */
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw));
 
 	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
-	CHECK_EXTRA_REGS(srcdst, srcdstw, (void)0);
+
+	if (reg_size == 5) {
+		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
+			return SLJIT_ERR_UNSUPPORTED;
+	} else if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	if ((type & SLJIT_SIMD_FLOAT) ? (elem_size < 2 || elem_size > 3) : elem_size > 2)
+		return SLJIT_ERR_UNSUPPORTED;
+#else /* !SLJIT_CONFIG_X86_32 */
+	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
+		return SLJIT_ERR_UNSUPPORTED;
+#endif /* SLJIT_CONFIG_X86_32 */
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
 
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	compiler->mode32 = 1;
+#else /* !SLJIT_CONFIG_X86_64 */
+	if (!(type & SLJIT_SIMD_FLOAT)) {
+		CHECK_EXTRA_REGS(srcdst, srcdstw, srcdst_is_ereg = 1);
+
+		if ((type & SLJIT_SIMD_STORE) && ((srcdst_is_ereg && elem_size < 2) || (elem_size == 0 && (type & SLJIT_SIMD_LANE_SIGNED) && FAST_IS_REG(srcdst) && reg_map[srcdst] >= 4))) {
+			srcdst_orig = srcdst;
+			srcdstw_orig = srcdstw;
+			srcdst = TMP_REG1;
+			srcdstw = 0;
+		}
+	}
 #endif /* SLJIT_CONFIG_X86_64 */
 
-	if (reg_size == 4) {
-		if (type & SLJIT_SIMD_FLOAT) {
-			if (elem_size == 3) {
-				if (type & SLJIT_SIMD_TEST)
-					return SLJIT_SUCCESS;
+	if (type & SLJIT_SIMD_LANE_ZERO) {
+		if (lane_index == 0) {
+			if (!(type & SLJIT_SIMD_FLOAT)) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+				if (elem_size == 3) {
+					compiler->mode32 = 0;
+					elem_size = 2;
+				}
+#endif /* SLJIT_CONFIG_X86_64 */
+				if (srcdst == SLJIT_IMM) {
+					if (elem_size == 0)
+						srcdstw = (sljit_u8)srcdstw;
+					else if (elem_size == 1)
+						srcdstw = (sljit_u16)srcdstw;
 
-				if (srcdst & SLJIT_MEM) {
-					if (type & SLJIT_SIMD_STORE)
-						opcode = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x;
-					else
-						opcode = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m;
-
-					return emit_sse2_logic(compiler, opcode, 1, freg, srcdst, srcdstw);
+					EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
+					srcdst = TMP_REG1;
+					srcdstw = 0;
+					elem_size = 2;
 				}
 
-				if (type & SLJIT_SIMD_STORE) {
-					if (lane_index == 1)
-						return emit_sse2_logic(compiler, MOVHLPS_x_x, 0, srcdst, freg, 0);
-					return emit_sse2_load(compiler, 0, srcdst, freg, 0);
+				if (elem_size == 2) {
+					if (reg_size == 4)
+						return emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw);
+					return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
 				}
+			} else if (srcdst & SLJIT_MEM) {
+				SLJIT_ASSERT(elem_size == 2 || elem_size == 3);
 
+				if (reg_size == 4)
+					return emit_groupf(compiler, MOVSD_x_xm, (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw);
+				return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw);
+			} else if (elem_size == 3) {
+				if (reg_size == 4)
+					return emit_groupf(compiler, MOVQ_x_xm, EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0);
+				return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0);
+			}
+		}
+
+		if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
+			freg = TMP_FREG;
+			lane_index -= (1 << (4 - elem_size));
+		} else if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) {
+			FAIL_IF(emit_sse2_load(compiler, elem_size == 2, TMP_FREG, srcdst, srcdstw));
+			srcdst = TMP_FREG;
+			srcdstw = 0;
+		}
+
+		size = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0);
+
+		if (reg_size == 5)
+			FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | VEX_256 | size | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0));
+		else
+			FAIL_IF(emit_groupf(compiler, (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm, size | EX86_SSE2, freg, freg, 0));
+	} else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
+		FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
+		FAIL_IF(emit_byte(compiler, 1));
+
+		freg = TMP_FREG;
+		lane_index -= (1 << (4 - elem_size));
+	}
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		if (elem_size == 3) {
+			if (srcdst & SLJIT_MEM) {
+				if (type & SLJIT_SIMD_STORE)
+					opcode = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x;
+				else
+					opcode = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m;
+
+				FAIL_IF(emit_groupf(compiler, opcode, EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw));
+
+				/* In case of store, freg is not TMP_FREG. */
+			} else if (type & SLJIT_SIMD_STORE) {
 				if (lane_index == 1)
-					return emit_sse2_logic(compiler, MOVLHPS_x_x, 0, freg, srcdst, 0);
-				return emit_sse2_store(compiler, 0, freg, 0, srcdst);
+					return emit_groupf(compiler, MOVHLPS_x_x, EX86_SSE2, srcdst, freg, 0);
+				return emit_sse2_load(compiler, 0, srcdst, freg, 0);
+			} else {
+				if (lane_index == 1)
+					FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x, EX86_SSE2, freg, srcdst, 0));
+				else
+					FAIL_IF(emit_sse2_store(compiler, 0, freg, 0, srcdst));
 			}
-
-			if (elem_size != 2)
-				return SLJIT_ERR_UNSUPPORTED;
-
-			if (type & SLJIT_SIMD_TEST)
-				return SLJIT_SUCCESS;
-
-			if (!(type & SLJIT_SIMD_STORE)) {
-				if (lane_index == 0 && !(srcdst & SLJIT_MEM))
-					return emit_sse2_store(compiler, 1, freg, 0, srcdst);
-
-				inst = emit_x86_instruction(compiler, 3 | EX86_PREF_66 | EX86_SSE2, freg, 0, srcdst, srcdstw);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = 0x3a;
-				inst[2] = INSERTPS_x_xm;
-
-				return emit_byte(compiler, U8(lane_index << 4));
-			}
-
+		} else if (type & SLJIT_SIMD_STORE) {
 			if (lane_index == 0)
 				return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg);
 
 			if (srcdst & SLJIT_MEM) {
-				inst = emit_x86_instruction(compiler, 3 | EX86_PREF_66 | EX86_SSE2, freg, 0, srcdst, srcdstw);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = 0x3a;
-				inst[2] = EXTRACTPS_x_xm;
-
+				FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
 				return emit_byte(compiler, U8(lane_index));
 			}
 
-			size = 2 | EX86_SSE2;
+			size = EX86_SSE2;
 			if (srcdst == freg)
 				opcode = SHUFPS_x_xm;
 			else {
+				if (cpu_feature_list & CPU_FEATURE_AVX) {
+					FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0));
+					return emit_byte(compiler, U8(lane_index));
+				}
+
 				switch (lane_index) {
 				case 1:
 					opcode = MOVSHDUP_x_xm;
-					size = 2 | EX86_PREF_F3 | EX86_SSE2;
+					size = EX86_PREF_F3 | EX86_SSE2;
 					break;
 				case 2:
 					opcode = MOVHLPS_x_x;
@@ -3848,78 +3924,140 @@
 				default:
 					SLJIT_ASSERT(lane_index == 3);
 					opcode = PSHUFD_x_xm;
-					size = 2 | EX86_PREF_66 | EX86_SSE2;
+					size = EX86_PREF_66 | EX86_SSE2;
 					break;
 				}
 			}
 
-			inst = emit_x86_instruction(compiler, size, srcdst, 0, freg, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = opcode;
+			FAIL_IF(emit_groupf(compiler, opcode, size, srcdst, freg, 0));
 
 			if (opcode == SHUFPS_x_xm || opcode == PSHUFD_x_xm)
 				return emit_byte(compiler, U8(lane_index));
 
 			return SLJIT_SUCCESS;
-		}
-
-		if (srcdst & SLJIT_IMM) {
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-			if (elem_size < 3)
-				srcdstw = (sljit_s32)srcdstw;
-#endif /* SLJIT_CONFIG_X86_64 */
-			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
-			srcdst = TMP_REG1;
-			srcdstw = 0;
-		}
-
-		size = 3;
-
-		switch (elem_size) {
-		case 0:
-			opcode = (type & SLJIT_SIMD_STORE) ? PEXTRB_rm_x_i8 : PINSRB_x_rm_i8;
-			break;
-		case 1:
-			if (!(type & SLJIT_SIMD_STORE)) {
-				size = 2;
-				opcode = PINSRW_x_rm_i8;
+		} else {
+			if (lane_index != 0 || (srcdst & SLJIT_MEM)) {
+				FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
+				FAIL_IF(emit_byte(compiler, U8(lane_index << 4)));
 			} else
-				opcode = PEXTRW_rm_x_i8;
-			break;
-		case 2:
-			opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
-			break;
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-		case 3:
-			/* PINSRQ / PEXTRQ */
-			opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
-			compiler->mode32 = 0;
-			break;
-#endif /* SLJIT_CONFIG_X86_64 */
+				FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst));
 		}
 
-		if (opcode == 0)
-			return SLJIT_ERR_UNSUPPORTED;
-
-		if (type & SLJIT_SIMD_TEST)
+		if (freg != TMP_FREG || (type & SLJIT_SIMD_STORE))
 			return SLJIT_SUCCESS;
 
-		inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
+		SLJIT_ASSERT(reg_size == 5);
 
-		if (size == 3) {
-			inst[1] = 0x3a;
-			inst[2] = opcode;
-		} else
-			inst[1] = opcode;
+		if (type & SLJIT_SIMD_LANE_ZERO) {
+			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
+			return emit_byte(compiler, 0x4e);
+		}
 
-		return emit_byte(compiler, U8(lane_index));
+		FAIL_IF(emit_vex_instruction(compiler, VINSERTF128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
+		return emit_byte(compiler, 1);
 	}
 
-	/* TODO: Support VEX prefix and longer reg types. */
-	return SLJIT_ERR_UNSUPPORTED;
+	if (srcdst == SLJIT_IMM) {
+		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcdstw);
+		srcdst = TMP_REG1;
+		srcdstw = 0;
+	}
+
+	size = 3;
+
+	switch (elem_size) {
+	case 0:
+		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRB_rm_x_i8 : PINSRB_x_rm_i8;
+		break;
+	case 1:
+		if (!(type & SLJIT_SIMD_STORE)) {
+			size = 2;
+			opcode = PINSRW_x_rm_i8;
+		} else
+			opcode = PEXTRW_rm_x_i8;
+		break;
+	case 2:
+		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
+		break;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	case 3:
+		/* PINSRQ / PEXTRQ */
+		opcode = (type & SLJIT_SIMD_STORE) ? PEXTRD_rm_x_i8 : PINSRD_x_rm_i8;
+		compiler->mode32 = 0;
+		break;
+#endif /* SLJIT_CONFIG_X86_64 */
+	}
+
+	inst = emit_x86_instruction(compiler, size | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
+	FAIL_IF(!inst);
+	inst[0] = GROUP_0F;
+
+	if (size == 3) {
+		inst[1] = 0x3a;
+		inst[2] = opcode;
+	} else
+		inst[1] = opcode;
+
+	FAIL_IF(emit_byte(compiler, U8(lane_index)));
+
+	if (!(type & SLJIT_SIMD_LANE_SIGNED) || (srcdst & SLJIT_MEM)) {
+		if (freg == TMP_FREG && !(type & SLJIT_SIMD_STORE)) {
+			SLJIT_ASSERT(reg_size == 5);
+
+			if (type & SLJIT_SIMD_LANE_ZERO) {
+				FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg_orig, 0, TMP_FREG, 0));
+				return emit_byte(compiler, 0x4e);
+			}
+
+			FAIL_IF(emit_vex_instruction(compiler, VINSERTI128_y_y_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2 | VEX_SSE2_OPV, freg_orig, freg_orig, TMP_FREG, 0));
+			return emit_byte(compiler, 1);
+		}
+
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+		if (srcdst_orig & SLJIT_MEM)
+			return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
+#endif /* SLJIT_CONFIG_X86_32 */
+		return SLJIT_SUCCESS;
+	}
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	if (elem_size >= 3)
+		return SLJIT_SUCCESS;
+
+	compiler->mode32 = (type & SLJIT_32);
+
+	size = 2;
+
+	if (elem_size == 0)
+		size |= EX86_REX;
+
+	if (elem_size == 2) {
+		if (type & SLJIT_32)
+			return SLJIT_SUCCESS;
+
+		SLJIT_ASSERT(!(compiler->mode32));
+		size = 1;
+	}
+
+	inst = emit_x86_instruction(compiler, size, srcdst, 0, srcdst, 0);
+	FAIL_IF(!inst);
+
+	if (size != 1) {
+		inst[0] = GROUP_0F;
+		inst[1] = U8((elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16);
+	} else
+		inst[0] = MOVSXD_r_rm;
+#else /* !SLJIT_CONFIG_X86_64 */
+	if (elem_size >= 2)
+		return SLJIT_SUCCESS;
+
+	FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16, 0,
+		(srcdst_orig != 0 && FAST_IS_REG(srcdst_orig)) ? srcdst_orig : srcdst, srcdst, 0));
+
+	if (srcdst_orig & SLJIT_MEM)
+		return emit_mov(compiler, srcdst_orig, srcdstw_orig, TMP_REG1, 0);
+#endif /* SLJIT_CONFIG_X86_64 */
+	return SLJIT_SUCCESS;
 }
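
Lane reads and writes map onto the insert/extract family with the lane index in the trailing imm8, and signed integer reads are finished with a movsx. Encodings assumed above (standard SSE2/SSE4.1):

    /* pinsrb xmm, r/m8, imm8  : 66 0F 3A 20 /r ib
       pextrb r/m8, xmm, imm8  : 66 0F 3A 14 /r ib
       pinsrw xmm, r/m16, imm8 : 66 0F C4 /r ib   (two-byte map, hence size = 2)
       A YMM lane past the XMM half is first moved into TMP_FREG with
       vextracti128/vextractf128 and then handled as an XMM lane. */
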
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type,
@@ -3928,101 +4066,103 @@
 {
 	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
 	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
-	sljit_u8 *inst;
 	sljit_uw pref;
 	sljit_u8 byte;
+#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+	sljit_s32 opcode3 = TMP_REG1;
+#else /* !SLJIT_CONFIG_X86_32 */
+	sljit_s32 opcode3 = SLJIT_S0;
+#endif /* SLJIT_CONFIG_X86_32 */
 
 	CHECK_ERROR();
 	CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index));
 
-	if (reg_size == 4) {
-		if (type & SLJIT_SIMD_FLOAT) {
-			pref = 0;
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
+	SLJIT_ASSERT(reg_map[opcode3] == 3);
 
-			if (elem_size == 3) {
-				if (type & SLJIT_SIMD_TEST)
-					return SLJIT_SUCCESS;
+	if (reg_size == 5) {
+		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
+			return SLJIT_ERR_UNSUPPORTED;
+	} else if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
 
-				if (src_lane_index == 0) {
-					inst = emit_x86_instruction(compiler, 2 | EX86_PREF_F2 | EX86_SSE2, freg, 0, src, 0);
-					FAIL_IF(!inst);
-					inst[0] = GROUP_0F;
-					inst[1] = MOVDDUP_x_xm;
-					return SLJIT_SUCCESS;
-				}
+	if (type & SLJIT_SIMD_FLOAT) {
+		pref = 0;
+		byte = U8(src_lane_index);
 
-				pref = EX86_PREF_66;
-			} else if (elem_size != 2)
-				return SLJIT_ERR_UNSUPPORTED;
-			else if (type & SLJIT_SIMD_TEST)
+		if (elem_size == 3) {
+			if (type & SLJIT_SIMD_TEST)
 				return SLJIT_SUCCESS;
 
-			if (freg != src) {
-				inst = emit_x86_instruction(compiler, 2 | pref | EX86_SSE2, freg, 0, src, 0);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = MOVAPS_x_xm;
+			if (reg_size == 5) {
+				if (src_lane_index == 0)
+					return emit_vex_instruction(compiler, VBROADCASTSD_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
+
+				FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+
+				byte = U8(byte | (byte << 2));
+				return emit_byte(compiler, U8(byte | (byte << 4)));
 			}
 
-			inst = emit_x86_instruction(compiler, 2 | pref | EX86_SSE2, freg, 0, freg, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = SHUFPS_x_xm;
+			if (src_lane_index == 0)
+				return emit_groupf(compiler, MOVDDUP_x_xm, EX86_PREF_F2 | EX86_SSE2, freg, src, 0);
 
-			byte = U8(src_lane_index);
-
-			if (elem_size == 2) {
-				byte = U8(byte | (byte << 2));
-				byte = U8(byte | (byte << 4));
-			} else
-				byte = U8(byte | (byte << 1));
-
-			return emit_byte(compiler, U8(byte));
-		}
-
-		if (type & SLJIT_SIMD_TEST)
+			/* The 66 prefix turns SHUFPS_x_xm into SHUFPD_x_xm below. */
+			pref = EX86_PREF_66;
+		} else if (elem_size != 2)
+			return SLJIT_ERR_UNSUPPORTED;
+		else if (type & SLJIT_SIMD_TEST)
 			return SLJIT_SUCCESS;
 
-		if (elem_size >= 1) {
-			if (elem_size == 1) {
-				if (src_lane_index >= 4) {
-					byte = U8(src_lane_index - 4);
-					src_lane_index = 2;
-					pref = EX86_PREF_F3;
-				} else {
-					byte = U8(src_lane_index);
-					src_lane_index = 0;
-					pref = EX86_PREF_F2;
-				}
+		if (reg_size == 5) {
+			SLJIT_ASSERT(elem_size == 2);
 
-				inst = emit_x86_instruction(compiler, 2 | pref | EX86_SSE2, freg, 0, src, 0);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = PSHUFLW_x_xm;
+			if (src_lane_index == 0)
+				return emit_vex_instruction(compiler, VBROADCASTSS_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
 
-				byte = U8(byte | (byte << 2));
-				FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
+			FAIL_IF(emit_vex_instruction(compiler, VPERMPD_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
 
-				src = freg;
+			byte = 0x44;
+			if (src_lane_index >= 4) {
+				byte = 0xee;
+				src_lane_index -= 4;
 			}
 
-			if (elem_size == 3)
-				src_lane_index <<= 1;
-
+			FAIL_IF(emit_byte(compiler, byte));
+			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | VEX_256 | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0));
 			byte = U8(src_lane_index);
-			byte = U8(byte | (byte << 2));
+		} else if (freg != src && (cpu_feature_list & CPU_FEATURE_AVX)) {
+			FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
+		} else {
+			if (freg != src)
+				FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, pref | EX86_SSE2, freg, src, 0));
 
-			if (elem_size == 3)
-				byte |= 0x4;
-
-			inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, freg, 0, src, 0);
-			FAIL_IF(!inst);
-			inst[0] = GROUP_0F;
-			inst[1] = PSHUFD_x_xm;
-			return emit_byte(compiler, U8(byte | (byte << 4)));
+			FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm, pref | EX86_SSE2, freg, freg, 0));
 		}
 
-		if (freg != src || src_lane_index != 0) {
+		if (elem_size == 2) {
+			byte = U8(byte | (byte << 2));
+			byte = U8(byte | (byte << 4));
+		} else
+			byte = U8(byte | (byte << 1));
+
+		return emit_byte(compiler, U8(byte));
+	}
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (elem_size == 0) {
+		if (reg_size == 5 && src_lane_index >= 16) {
+			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+			FAIL_IF(emit_byte(compiler, src_lane_index >= 24 ? 0xff : 0xaa));
+			src_lane_index &= 0x7;
+			src = freg;
+		}
+
+		if ((freg != src && !(cpu_feature_list & CPU_FEATURE_AVX2)) || src_lane_index != 0) {
 			pref = 0;
 
 			if ((src_lane_index & 0x3) == 0) {
@@ -4032,50 +4172,281 @@
 				pref = EX86_PREF_F2;
 				byte = U8(src_lane_index >> 1);
 			} else {
-				if (freg != src) {
-					inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, freg, 0, src, 0);
-					FAIL_IF(!inst);
-					inst[0] = GROUP_0F;
-					inst[1] = MOVDQA_x_xm;
-				}
+				if (freg == src || !(cpu_feature_list & CPU_FEATURE_AVX2)) {
+					if (freg != src)
+						FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm, EX86_PREF_66 | EX86_SSE2, freg, src, 0));
 
-				inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP2, 0, 0, freg, 0);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = PSRLDQ_x;
-				inst[2] |= (0x3 << 3);
+					FAIL_IF(emit_groupf(compiler, PSRLDQ_x, 2 | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0));
+				} else
+					FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0));
 
 				FAIL_IF(emit_byte(compiler, U8(src_lane_index)));
 			}
 
 			if (pref != 0) {
-				inst = emit_x86_instruction(compiler, 2 | pref | EX86_SSE2, freg, 0, src, 0);
-				FAIL_IF(!inst);
-				inst[0] = GROUP_0F;
-				inst[1] = PSHUFLW_x_xm;
-
-				byte = U8(byte | (byte << 2));
-				FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
+				FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, pref | EX86_SSE2, freg, src, 0));
+				FAIL_IF(emit_byte(compiler, byte));
 			}
 
 			src = freg;
 		}
 
-		inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2, TMP_FREG, 0, TMP_FREG, 0);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = PXOR_x_xm;
+		if (cpu_feature_list & CPU_FEATURE_AVX2)
+			return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
 
-		inst = emit_x86_instruction(compiler, 3 | EX86_PREF_66 | EX86_SSE2, freg, 0, TMP_FREG, 0);
-		FAIL_IF(!inst);
-		inst[0] = GROUP_0F;
-		inst[1] = 0x38;
-		inst[2] = PSHUFB_x_xm;
+		SLJIT_ASSERT(reg_size == 4);
+		FAIL_IF(emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
+		return emit_groupf_ext(compiler, PSHUFB_x_xm, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
+	}
+
+	if ((cpu_feature_list & CPU_FEATURE_AVX2) && src_lane_index == 0 && elem_size <= 3) {
+		switch (elem_size) {
+		case 1:
+			pref = VPBROADCASTW_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+			break;
+		case 2:
+			pref = VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+			break;
+		default:
+			pref = VPBROADCASTQ_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2;
+			break;
+		}
+
+		if (reg_size == 5)
+			pref |= VEX_256;
+
+		return emit_vex_instruction(compiler, pref, freg, 0, src, 0);
+	}
+
+	if (reg_size == 5) {
+		switch (elem_size) {
+		case 1:
+			byte = U8(src_lane_index & 0x3);
+			src_lane_index >>= 2;
+			pref = PSHUFLW_x_xm | VEX_256 | ((src_lane_index & 1) == 0 ? EX86_PREF_F2 : EX86_PREF_F3) | EX86_SSE2;
+			break;
+		case 2:
+			byte = U8(src_lane_index & 0x3);
+			src_lane_index >>= 1;
+			pref = PSHUFD_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2;
+			break;
+		case 3:
+			pref = 0;
+			break;
+		default:
+			FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+			return emit_byte(compiler, U8(src_lane_index == 0 ? 0x44 : 0xee));
+		}
+
+		if (pref != 0) {
+			FAIL_IF(emit_vex_instruction(compiler, pref, freg, 0, src, 0));
+			byte = U8(byte | (byte << 2));
+			FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
+
+			if (src_lane_index == 0)
+				return emit_vex_instruction(compiler, VPBROADCASTQ_x_xm | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
+
+			src = freg;
+		}
+
+		FAIL_IF(emit_vex_instruction(compiler, VPERMQ_y_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | VEX_W | EX86_SSE2, freg, 0, src, 0));
+		byte = U8(src_lane_index);
+		byte = U8(byte | (byte << 2));
+		return emit_byte(compiler, U8(byte | (byte << 4)));
+	}
+
+	switch (elem_size) {
+	case 1:
+		byte = U8(src_lane_index & 0x3);
+		src_lane_index >>= 1;
+		pref = (src_lane_index & 2) == 0 ? EX86_PREF_F2 : EX86_PREF_F3;
+
+		FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, pref | EX86_SSE2, freg, src, 0));
+		byte = U8(byte | (byte << 2));
+		FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
+
+		if ((cpu_feature_list & CPU_FEATURE_AVX2) && pref == EX86_PREF_F2)
+			return emit_vex_instruction(compiler, VPBROADCASTD_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, freg, 0);
+
+		src = freg;
+		/* fallthrough */
+	case 2:
+		byte = U8(src_lane_index);
+		byte = U8(byte | (byte << 2));
+		break;
+	default:
+		byte = U8(src_lane_index << 1);
+		byte = U8(byte | (byte << 2) | 0x4);
+		break;
+	}
+
+	FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, src, 0));
+	return emit_byte(compiler, U8(byte | (byte << 4)));
+}
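
The repeated byte | (byte << 2) folding above builds the 2-bit-per-lane selector immediates of PSHUFD/SHUFPS/VPERMQ: replicating 32-bit lane i needs imm8 = i * 0x55. Worked through once:

    /* i = 2 :  i | i<<2 | i<<4 | i<<6  =  10 10 10 10b  =  0xAA,
       i.e. all four destination dwords select source dword 2.
       16-bit lanes first use pshuflw/pshufhw (the F2/F3 prefix picks the
       qword half), then pshufd spreads the chosen dword. */
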
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+	sljit_u8 opcode;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
+
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	if (reg_size == 5) {
+		if (!(cpu_feature_list & CPU_FEATURE_AVX2))
+			return SLJIT_ERR_UNSUPPORTED;
+	} else if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		if (elem_size != 2 || elem2_size != 3)
+			return SLJIT_ERR_UNSUPPORTED;
+
+		if (type & SLJIT_SIMD_TEST)
+			return SLJIT_SUCCESS;
+
+		if (reg_size == 4)
+			return emit_groupf(compiler, CVTPS2PD_x_xm, EX86_SSE2, freg, src, srcw);
+		return emit_vex_instruction(compiler, CVTPS2PD_x_xm | VEX_256 | EX86_SSE2, freg, 0, src, srcw);
+	}
+
+	switch (elem_size) {
+	case 0:
+		if (elem2_size == 1)
+			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBW_x_xm : PMOVZXBW_x_xm;
+		else if (elem2_size == 2)
+			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBD_x_xm : PMOVZXBD_x_xm;
+		else if (elem2_size == 3)
+			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXBQ_x_xm : PMOVZXBQ_x_xm;
+		else
+			return SLJIT_ERR_UNSUPPORTED;
+		break;
+	case 1:
+		if (elem2_size == 2)
+			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWD_x_xm : PMOVZXWD_x_xm;
+		else if (elem2_size == 3)
+			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXWQ_x_xm : PMOVZXWQ_x_xm;
+		else
+			return SLJIT_ERR_UNSUPPORTED;
+		break;
+	case 2:
+		if (elem2_size == 3)
+			opcode = (type & SLJIT_SIMD_EXTEND_SIGNED) ? PMOVSXDQ_x_xm : PMOVZXDQ_x_xm;
+		else
+			return SLJIT_ERR_UNSUPPORTED;
+		break;
+	default:
+		return SLJIT_ERR_UNSUPPORTED;
+	}
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (reg_size == 4)
+		return emit_groupf_ext(compiler, opcode, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw);
+	return emit_vex_instruction(compiler, opcode | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw);
+}
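
All the widening conversions land on the PMOVSX/PMOVZX family in the 0F 38 map; the AVX2 variants only add VEX.256 so the destination widens to a YMM register. For instance (standard encodings):

    /* pmovzxbw xmm, xmm/m64 : 66 0F 38 30 /r
       pmovsxbw xmm, xmm/m64 : 66 0F 38 20 /r
       The zero- vs sign-extending choice is the SLJIT_SIMD_EXTEND_SIGNED bit. */
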
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_s32 dst_r;
+	sljit_uw pref;
+	sljit_u8 *inst;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
+
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	CHECK_EXTRA_REGS(dst, dstw, (void)0);
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+	compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+	if (elem_size > 3 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (reg_size == 4) {
+		if (type & SLJIT_SIMD_TEST)
+			return SLJIT_SUCCESS;
+
+		pref = EX86_PREF_66 | EX86_SSE2_OP2;
+
+		switch (elem_size) {
+		case 1:
+			FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0));
+			freg = TMP_FREG;
+			break;
+		case 2:
+			pref = EX86_SSE2_OP2;
+			break;
+		}
+
+		dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+		FAIL_IF(emit_groupf(compiler, elem_size < 2 ? PMOVMSKB_r_x : MOVMSKPS_r_x, pref, dst_r, freg, 0));
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = type & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+		if (elem_size == 1) {
+			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 8, dst_r, 0);
+			FAIL_IF(!inst);
+			inst[1] |= SHR;
+		}
+
+		if (dst_r == TMP_REG1)
+			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+
 		return SLJIT_SUCCESS;
 	}
 
-	/* TODO: Support VEX prefix and longer reg types. */
-	return SLJIT_ERR_UNSUPPORTED;
+	if (reg_size != 5 || !(cpu_feature_list & CPU_FEATURE_AVX2))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
+
+	if (elem_size == 1) {
+		FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
+		FAIL_IF(emit_byte(compiler, 1));
+		FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0));
+		FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x, EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0));
+	} else {
+		pref = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2;
+
+		if (elem_size == 0)
+			pref = PMOVMSKB_r_x | VEX_256 | EX86_PREF_66 | EX86_SSE2_OP2;
+		else if (elem_size == 3)
+			pref |= EX86_PREF_66;
+
+		FAIL_IF(emit_vex_instruction(compiler, pref, dst_r, 0, freg, 0));
+	}
+
+	if (dst_r == TMP_REG1) {
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+		compiler->mode32 = type & SLJIT_32;
+#endif /* SLJIT_CONFIG_X86_64 */
+		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
+	}
+
+	return SLJIT_SUCCESS;
 }
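
sljit_emit_simd_sign gathers one bit per lane with the movmsk family; 16-bit lanes have no direct form, so the words are first narrowed with the saturating, sign-preserving packsswb and the resulting byte mask is shifted into place with a shr by 8. The encodings relied on (standard SSE2):

    /* pmovmskb r32, xmm : 66 0F D7 /r   one bit per byte lane
       movmskps r32, xmm :    0F 50 /r   one bit per f32 lane
       movmskpd r32, xmm : 66 0F 50 /r   one bit per f64 lane */
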
 
 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
@@ -4094,8 +4465,7 @@
 	sljit_s32 mem_reg,
 	sljit_s32 temp_reg)
 {
-	sljit_u8 *inst;
-	sljit_uw size;
+	sljit_uw pref;
 	sljit_s32 free_reg = TMP_REG1;
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
 	sljit_sw srcw = 0;
@@ -4163,18 +4533,15 @@
 	/* Lock prefix. */
 	FAIL_IF(emit_byte(compiler, GROUP_LOCK));
 
-	size = 2;
+	pref = 0;
 	if (op == SLJIT_MOV_U16)
-		size |= EX86_HALF_ARG | EX86_PREF_66;
+		pref = EX86_HALF_ARG | EX86_PREF_66;
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
 	if (op == SLJIT_MOV_U8)
-		size |= EX86_REX;
+		pref = EX86_REX;
 #endif /* SLJIT_CONFIG_X86_64 */
 
-	inst = emit_x86_instruction(compiler, size, src_reg, 0, SLJIT_MEM1(mem_reg), 0);
-	FAIL_IF(!inst);
-	inst[0] = GROUP_0F;
-	inst[1] = U8(op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r);
+	FAIL_IF(emit_groupf(compiler, U8(op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r), pref, src_reg, SLJIT_MEM1(mem_reg), 0));
 
 	if (temp_reg != SLJIT_R0) {
 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)