/* priv/host_generic_simd128.c - platform/external/valgrind - Git at Google */


 /*---------------------------------------------------------------*/
 /*--- begin                            host_generic_simd128.c ---*/
 /*---------------------------------------------------------------*/

 /*
    This file is part of Valgrind, a dynamic binary instrumentation
    framework.

    Copyright (C) 2010-2012 OpenWorks GbR
       info@open-works.net

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
    02110-1301, USA.

    The GNU General Public License is contained in the file COPYING.
 */

 /* Generic helper functions for doing 128-bit SIMD arithmetic in cases
    where the instruction selectors cannot generate code in-line.
    These are purely back-end entities and cannot be seen/referenced
    from IR. */

 #include "libvex_basictypes.h"
 #include "host_generic_simd128.h"


 /* Primitive helpers always take args of the real type (signed vs
    unsigned) but return an unsigned result, so there's no conversion
    weirdness when stuffing results back in the V128 union fields,
    which are all unsigned. */

 /* Multiply two signed 32-bit lane values and return the product
    narrowed to an unsigned 32-bit lane.

    The multiplication is done in Long rather than Int: the product of
    two Ints can exceed the range of Int, and signed overflow is
    undefined behaviour in C.  Widening first keeps the arithmetic
    defined; toUInt is then relied on to narrow the result back to
    32 bits (see its definition in libvex_basictypes.h). */
 static inline UInt mul32 ( Int xx, Int yy )
 {
    Long t = ((Long)xx) * ((Long)yy);
    return toUInt(t);
 }

 /* Maximum of two Int lane values, handed back through toUInt so it
    can be stored in an unsigned lane. */
 static inline UInt max32S ( Int xx, Int yy )
 {
    Int winner = yy;
    if (xx > yy) {
       winner = xx;
    }
    return toUInt(winner);
 }

 /* Minimum of two Int lane values, handed back through toUInt so it
    can be stored in an unsigned lane. */
 static inline UInt min32S ( Int xx, Int yy )
 {
    Int winner = yy;
    if (xx < yy) {
       winner = xx;
    }
    return toUInt(winner);
 }

 /* Maximum of two UInt lane values. */
 static inline UInt max32U ( UInt xx, UInt yy )
 {
    UInt winner = yy;
    if (xx > yy) {
       winner = xx;
    }
    return toUInt(winner);
 }

 /* Minimum of two UInt lane values. */
 static inline UInt min32U ( UInt xx, UInt yy )
 {
    UInt winner = yy;
    if (xx < yy) {
       winner = xx;
    }
    return toUInt(winner);
 }

 /* Maximum of two UShort lane values. */
 static inline UShort max16U ( UShort xx, UShort yy )
 {
    UShort winner = yy;
    if (xx > yy) {
       winner = xx;
    }
    return toUShort(winner);
 }

 /* Minimum of two UShort lane values. */
 static inline UShort min16U ( UShort xx, UShort yy )
 {
    UShort winner = yy;
    if (xx < yy) {
       winner = xx;
    }
    return toUShort(winner);
 }

 /* Maximum of two lane values compared as Char; the winner is passed
    through toUChar so it can be stored in an unsigned 8-bit lane. */
 static inline UChar max8S ( Char xx, Char yy )
 {
    Char winner = yy;
    if (xx > yy) {
       winner = xx;
    }
    return toUChar(winner);
 }

 /* Minimum of two lane values compared as Char; the winner is passed
    through toUChar so it can be stored in an unsigned 8-bit lane. */
 static inline UChar min8S ( Char xx, Char yy )
 {
    Char winner = yy;
    if (xx < yy) {
       winner = xx;
    }
    return toUChar(winner);
 }

 /* 64-bit equality test producing a lane mask: all ones when the two
    values are equal, all zeroes otherwise. */
 static inline ULong cmpEQ64 ( Long xx, Long yy )
 {
    if (xx == yy) {
       return 0xFFFFFFFFFFFFFFFFULL;
    }
    return 0ULL;
 }

 /* Signed 64-bit greater-than test producing a lane mask: all ones
    when xx > yy, all zeroes otherwise. */
 static inline ULong cmpGT64S ( Long xx, Long yy )
 {
    if (xx > yy) {
       return 0xFFFFFFFFFFFFFFFFULL;
    }
    return 0ULL;
 }

 /* Right-shift a 64-bit lane by n after reinterpreting it as Long, so
    the shift is applied to a signed value.  The caller must supply an
    in-range shift amount. */
 static inline ULong sar64 ( ULong v, UInt n )
 {
    Long asSigned = (Long)v;
    return asSigned >> n;
 }

 /* Right-shift an 8-bit lane by n after reinterpreting it as Char, so
    the shift is applied to a signed value; the result goes back
    through toUChar. */
 static inline UChar sar8 ( UChar v, UInt n )
 {
    Char asSigned = (Char)v;
    return toUChar(asSigned >> n);
 }

 /* Saturating narrow: treat the 32-bit lane as signed and clamp it to
    the range 0 .. 65535 before returning it as a UShort. */
 static inline UShort qnarrow32Sto16U ( UInt xx0 )
 {
    Int sx = (Int)xx0;
    if (sx < 0) {
       return 0;       /* below range: saturate to the minimum */
    }
    if (sx > 65535) {
       return 65535;   /* above range: saturate to the maximum */
    }
    return (UShort)sx;
 }

 /* Non-saturating narrow: plain conversion of a 32-bit lane to
    UShort, with no clamping. */
 static inline UShort narrow32to16 ( UInt xx )
 {
    UShort narrowed = (UShort)xx;
    return narrowed;
 }

 /* Non-saturating narrow: plain conversion of a 16-bit lane to
    UChar, with no clamping. */
 static inline UChar narrow16to8 ( UShort xx )
 {
    UChar narrowed = (UChar)xx;
    return narrowed;
 }


 /* Lane-wise multiply: each of the four w32 lanes of *res receives
    mul32 applied to the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
                               V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 4; lane++) {
       res->w32[lane] = mul32(argL->w32[lane], argR->w32[lane]);
    }
 }

 /* Lane-wise signed maximum: each of the four w32 lanes of *res
    receives max32S of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 4; lane++) {
       res->w32[lane] = max32S(argL->w32[lane], argR->w32[lane]);
    }
 }

 /* Lane-wise signed minimum: each of the four w32 lanes of *res
    receives min32S of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 4; lane++) {
       res->w32[lane] = min32S(argL->w32[lane], argR->w32[lane]);
    }
 }

 /* Lane-wise unsigned maximum: each of the four w32 lanes of *res
    receives max32U of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 4; lane++) {
       res->w32[lane] = max32U(argL->w32[lane], argR->w32[lane]);
    }
 }

 /* Lane-wise unsigned minimum: each of the four w32 lanes of *res
    receives min32U of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 4; lane++) {
       res->w32[lane] = min32U(argL->w32[lane], argR->w32[lane]);
    }
 }

 /* Lane-wise unsigned maximum: each of the eight w16 lanes of *res
    receives max16U of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 8; lane++) {
       res->w16[lane] = max16U(argL->w16[lane], argR->w16[lane]);
    }
 }

 /* Lane-wise unsigned minimum: each of the eight w16 lanes of *res
    receives min16U of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 8; lane++) {
       res->w16[lane] = min16U(argL->w16[lane], argR->w16[lane]);
    }
 }

 /* Lane-wise signed maximum: each of the sixteen w8 lanes of *res
    receives max8S of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 16; lane++) {
       res->w8[lane] = max8S(argL->w8[lane], argR->w8[lane]);
    }
 }

 /* Lane-wise signed minimum: each of the sixteen w8 lanes of *res
    receives min8S of the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 16; lane++) {
       res->w8[lane] = min8S(argL->w8[lane], argR->w8[lane]);
    }
 }

 /* Lane-wise equality: each of the two w64 lanes of *res becomes the
    all-ones / all-zeroes mask produced by cmpEQ64 for the matching
    lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
                                 V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 2; lane++) {
       res->w64[lane] = cmpEQ64(argL->w64[lane], argR->w64[lane]);
    }
 }

 /* Lane-wise signed greater-than: each of the two w64 lanes of *res
    becomes the all-ones / all-zeroes mask produced by cmpGT64S for
    the matching lanes of *argL and *argR. */
 void VEX_REGPARM(3)
      h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
                                  V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 2; lane++) {
       res->w64[lane] = cmpGT64S(argL->w64[lane], argR->w64[lane]);
    }
 }

 /* ------------ Shifting ------------ */
 /* Note that because these primops are undefined if the shift amount
    equals or exceeds the lane width, the shift amount is masked so
    that the scalar shifts are always in range.  In fact, given the
    semantics of these primops (Sar64x2, etc) it is an error if in
    fact we are ever given an out-of-range shift amount.
 */
 /* Shift both w64 lanes of *argL right by the same amount using
    sar64, storing the results in *res.  Callers are expected to pass
    nn < 64; the amount is masked with 63 anyway so the scalar shift
    is always in range. */
 void /*not-regparm*/
      h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
                                V128* argL, UInt nn)
 {
    const UInt amount = nn & 63;
    UInt lane;
    for (lane = 0; lane < 2; lane++) {
       res->w64[lane] = sar64(argL->w64[lane], amount);
    }
 }

 /* Shift all sixteen w8 lanes of *argL right by the same amount using
    sar8, storing the results in *res.  Callers are expected to pass
    nn < 8; the amount is masked with 7 anyway so the scalar shift is
    always in range. */
 void /*not-regparm*/
      h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
                               V128* argL, UInt nn)
 {
    const UInt amount = nn & 7;
    UInt lane;
    for (lane = 0; lane < 16; lane++) {
       res->w8[lane] = sar8(argL->w8[lane], amount);
    }
 }

 /* Saturating binary narrow.  The four w32 lanes of *argR are
    narrowed with qnarrow32Sto16U into w16 lanes 0..3 of *res, and the
    four w32 lanes of *argL into w16 lanes 4..7.
    NOTE(review): result lanes are stored while inputs are still being
    read, so this presumably expects res not to overlap argL/argR --
    confirm against callers. */
 void VEX_REGPARM(3)
      h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
                                            V128* argL, V128* argR )
 {
    UInt i;
    /* Lower half of the result comes from argR ... */
    for (i = 0; i < 4; i++) {
       res->w16[i] = qnarrow32Sto16U(argR->w32[i]);
    }
    /* ... and the upper half from argL. */
    for (i = 0; i < 4; i++) {
       res->w16[4 + i] = qnarrow32Sto16U(argL->w32[i]);
    }
 }

 /* Non-saturating binary narrow.  The eight w16 lanes of *argR are
    narrowed with narrow16to8 into w8 lanes 0..7 of *res, and the
    eight w16 lanes of *argL into w8 lanes 8..15.
    NOTE(review): result lanes are stored while inputs are still being
    read, so this presumably expects res not to overlap argL/argR --
    confirm against callers. */
 void VEX_REGPARM(3)
      h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
                                         V128* argL, V128* argR )
 {
    UInt i;
    /* Lower half of the result comes from argR ... */
    for (i = 0; i < 8; i++) {
       res->w8[i] = narrow16to8(argR->w16[i]);
    }
    /* ... and the upper half from argL. */
    for (i = 0; i < 8; i++) {
       res->w8[8 + i] = narrow16to8(argL->w16[i]);
    }
 }

 /* Non-saturating binary narrow.  The four w32 lanes of *argR are
    narrowed with narrow32to16 into w16 lanes 0..3 of *res, and the
    four w32 lanes of *argL into w16 lanes 4..7.
    NOTE(review): result lanes are stored while inputs are still being
    read, so this presumably expects res not to overlap argL/argR --
    confirm against callers. */
 void VEX_REGPARM(3)
      h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
                                         V128* argL, V128* argR )
 {
    UInt i;
    /* Lower half of the result comes from argR ... */
    for (i = 0; i < 4; i++) {
       res->w16[i] = narrow32to16(argR->w32[i]);
    }
    /* ... and the upper half from argL. */
    for (i = 0; i < 4; i++) {
       res->w16[4 + i] = narrow32to16(argL->w32[i]);
    }
 }

 /* Permute 32-bit lanes.  Result lane i is the w32 lane of *argL
    selected by the low two bits of w32 lane i of *argR; the "& 3"
    keeps the selector inside the four available lanes.
    NOTE(review): result lanes are stored while argL is still being
    read, so this presumably expects res not to overlap argL/argR --
    confirm against callers. */
 void VEX_REGPARM(3)
      h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
                                V128* argL, V128* argR )
 {
    UInt lane;
    for (lane = 0; lane < 4; lane++) {
       res->w32[lane] = argL->w32[ argR->w32[lane] & 3 ];
    }
 }

 /* Gather the most significant bit of each of the sixteen bytes held
    in w64hi:w64lo into a 16-bit mask.  Result bit i (0..7) is the top
    bit of byte i of w64lo; result bit 8+i is the top bit of byte i of
    w64hi.  Bits 16 and above of the result are zero. */
 UInt /*not-regparm*/
      h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
 {
    UInt r = 0;
    UInt byteNo;
    for (byteNo = 0; byteNo < 8; byteNo++) {
       /* Position of this byte's top bit within the 64-bit word. */
       const UInt msbPos = 8 * byteNo + 7;
       if ((w64lo >> msbPos) & 1ULL) {
          r |= (1U << byteNo);
       }
       if ((w64hi >> msbPos) & 1ULL) {
          r |= (1U << (byteNo + 8));
       }
    }
    return r;
 }

 /*---------------------------------------------------------------*/
 /*--- end                              host_generic_simd128.c ---*/
 /*---------------------------------------------------------------*/

	/*---------------------------------------------------------------*/
	/*--- begin host_generic_simd128.c ---*/
	/*---------------------------------------------------------------*/

	/*
	This file is part of Valgrind, a dynamic binary instrumentation
	framework.

	Copyright (C) 2010-2012 OpenWorks GbR
	info@open-works.net

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License as
	published by the Free Software Foundation; either version 2 of the
	License, or (at your option) any later version.

	This program is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
	02110-1301, USA.

	The GNU General Public License is contained in the file COPYING.
	*/

	/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
	where the instruction selectors cannot generate code in-line.
	These are purely back-end entities and cannot be seen/referenced
	from IR. */

	#include "libvex_basictypes.h"
	#include "host_generic_simd128.h"


	/* Primitive helpers always take args of the real type (signed vs
	unsigned) but return an unsigned result, so there's no conversion
	weirdness when stuffing results back in the V128 union fields,
	which are all unsigned. */

	/* Multiply two signed 32-bit lane values and return the product
	   narrowed to an unsigned 32-bit lane.

	   The multiplication is done in Long rather than Int: the product of
	   two Ints can exceed the range of Int, and signed overflow is
	   undefined behaviour in C.  Widening first keeps the arithmetic
	   defined; toUInt is then relied on to narrow the result back to
	   32 bits (see its definition in libvex_basictypes.h). */
	static inline UInt mul32 ( Int xx, Int yy )
	{
	   Long t = ((Long)xx) * ((Long)yy);
	   return toUInt(t);
	}

	/* Maximum of two Int lane values, handed back through toUInt so it
	   can be stored in an unsigned lane. */
	static inline UInt max32S ( Int xx, Int yy )
	{
	   Int winner = yy;
	   if (xx > yy) {
	      winner = xx;
	   }
	   return toUInt(winner);
	}

	/* Minimum of two Int lane values, handed back through toUInt so it
	   can be stored in an unsigned lane. */
	static inline UInt min32S ( Int xx, Int yy )
	{
	   Int winner = yy;
	   if (xx < yy) {
	      winner = xx;
	   }
	   return toUInt(winner);
	}

	/* Maximum of two UInt lane values. */
	static inline UInt max32U ( UInt xx, UInt yy )
	{
	   UInt winner = yy;
	   if (xx > yy) {
	      winner = xx;
	   }
	   return toUInt(winner);
	}

	/* Minimum of two UInt lane values. */
	static inline UInt min32U ( UInt xx, UInt yy )
	{
	   UInt winner = yy;
	   if (xx < yy) {
	      winner = xx;
	   }
	   return toUInt(winner);
	}

	/* Maximum of two UShort lane values. */
	static inline UShort max16U ( UShort xx, UShort yy )
	{
	   UShort winner = yy;
	   if (xx > yy) {
	      winner = xx;
	   }
	   return toUShort(winner);
	}

	/* Minimum of two UShort lane values. */
	static inline UShort min16U ( UShort xx, UShort yy )
	{
	   UShort winner = yy;
	   if (xx < yy) {
	      winner = xx;
	   }
	   return toUShort(winner);
	}

	/* Maximum of two lane values compared as Char; the winner is passed
	   through toUChar so it can be stored in an unsigned 8-bit lane. */
	static inline UChar max8S ( Char xx, Char yy )
	{
	   Char winner = yy;
	   if (xx > yy) {
	      winner = xx;
	   }
	   return toUChar(winner);
	}

	/* Minimum of two lane values compared as Char; the winner is passed
	   through toUChar so it can be stored in an unsigned 8-bit lane. */
	static inline UChar min8S ( Char xx, Char yy )
	{
	   Char winner = yy;
	   if (xx < yy) {
	      winner = xx;
	   }
	   return toUChar(winner);
	}

	/* 64-bit equality test producing a lane mask: all ones when the two
	   values are equal, all zeroes otherwise. */
	static inline ULong cmpEQ64 ( Long xx, Long yy )
	{
	   if (xx == yy) {
	      return 0xFFFFFFFFFFFFFFFFULL;
	   }
	   return 0ULL;
	}

	/* Signed 64-bit greater-than test producing a lane mask: all ones
	   when xx > yy, all zeroes otherwise. */
	static inline ULong cmpGT64S ( Long xx, Long yy )
	{
	   if (xx > yy) {
	      return 0xFFFFFFFFFFFFFFFFULL;
	   }
	   return 0ULL;
	}

	/* Right-shift a 64-bit lane by n after reinterpreting it as Long, so
	   the shift is applied to a signed value.  The caller must supply an
	   in-range shift amount. */
	static inline ULong sar64 ( ULong v, UInt n )
	{
	   Long asSigned = (Long)v;
	   return asSigned >> n;
	}

	/* Right-shift an 8-bit lane by n after reinterpreting it as Char, so
	   the shift is applied to a signed value; the result goes back
	   through toUChar. */
	static inline UChar sar8 ( UChar v, UInt n )
	{
	   Char asSigned = (Char)v;
	   return toUChar(asSigned >> n);
	}

	/* Saturating narrow: treat the 32-bit lane as signed and clamp it to
	   the range 0 .. 65535 before returning it as a UShort. */
	static inline UShort qnarrow32Sto16U ( UInt xx0 )
	{
	   Int sx = (Int)xx0;
	   if (sx < 0) {
	      return 0;       /* below range: saturate to the minimum */
	   }
	   if (sx > 65535) {
	      return 65535;   /* above range: saturate to the maximum */
	   }
	   return (UShort)sx;
	}

	/* Non-saturating narrow: plain conversion of a 32-bit lane to
	   UShort, with no clamping. */
	static inline UShort narrow32to16 ( UInt xx )
	{
	   UShort narrowed = (UShort)xx;
	   return narrowed;
	}

	/* Non-saturating narrow: plain conversion of a 16-bit lane to
	   UChar, with no clamping. */
	static inline UChar narrow16to8 ( UShort xx )
	{
	   UChar narrowed = (UChar)xx;
	   return narrowed;
	}


	/* Lane-wise multiply: each of the four w32 lanes of *res receives
	   mul32 applied to the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
	                              V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 4; lane++) {
	      res->w32[lane] = mul32(argL->w32[lane], argR->w32[lane]);
	   }
	}

	/* Lane-wise signed maximum: each of the four w32 lanes of *res
	   receives max32S of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 4; lane++) {
	      res->w32[lane] = max32S(argL->w32[lane], argR->w32[lane]);
	   }
	}

	/* Lane-wise signed minimum: each of the four w32 lanes of *res
	   receives min32S of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 4; lane++) {
	      res->w32[lane] = min32S(argL->w32[lane], argR->w32[lane]);
	   }
	}

	/* Lane-wise unsigned maximum: each of the four w32 lanes of *res
	   receives max32U of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 4; lane++) {
	      res->w32[lane] = max32U(argL->w32[lane], argR->w32[lane]);
	   }
	}

	/* Lane-wise unsigned minimum: each of the four w32 lanes of *res
	   receives min32U of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 4; lane++) {
	      res->w32[lane] = min32U(argL->w32[lane], argR->w32[lane]);
	   }
	}

	/* Lane-wise unsigned maximum: each of the eight w16 lanes of *res
	   receives max16U of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 8; lane++) {
	      res->w16[lane] = max16U(argL->w16[lane], argR->w16[lane]);
	   }
	}

	/* Lane-wise unsigned minimum: each of the eight w16 lanes of *res
	   receives min16U of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 8; lane++) {
	      res->w16[lane] = min16U(argL->w16[lane], argR->w16[lane]);
	   }
	}

	/* Lane-wise signed maximum: each of the sixteen w8 lanes of *res
	   receives max8S of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 16; lane++) {
	      res->w8[lane] = max8S(argL->w8[lane], argR->w8[lane]);
	   }
	}

	/* Lane-wise signed minimum: each of the sixteen w8 lanes of *res
	   receives min8S of the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 16; lane++) {
	      res->w8[lane] = min8S(argL->w8[lane], argR->w8[lane]);
	   }
	}

	/* Lane-wise equality: each of the two w64 lanes of *res becomes the
	   all-ones / all-zeroes mask produced by cmpEQ64 for the matching
	   lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
	                                V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 2; lane++) {
	      res->w64[lane] = cmpEQ64(argL->w64[lane], argR->w64[lane]);
	   }
	}

	/* Lane-wise signed greater-than: each of the two w64 lanes of *res
	   becomes the all-ones / all-zeroes mask produced by cmpGT64S for
	   the matching lanes of *argL and *argR. */
	void VEX_REGPARM(3)
	     h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
	                                 V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 2; lane++) {
	      res->w64[lane] = cmpGT64S(argL->w64[lane], argR->w64[lane]);
	   }
	}

	/* ------------ Shifting ------------ */
	/* Note that because these primops are undefined if the shift amount
	equals or exceeds the lane width, the shift amount is masked so
	that the scalar shifts are always in range. In fact, given the
	semantics of these primops (Sar64x2, etc) it is an error if in
	fact we are ever given an out-of-range shift amount.
	*/
	void /not-regparm/
	h_generic_calc_SarN64x2 ( /OUT/V128* res,
	V128* argL, UInt nn)
	{
	/* vassert(nn < 64); */
	nn &= 63;
	res->w64[0] = sar64(argL->w64[0], nn);
	res->w64[1] = sar64(argL->w64[1], nn);
	}

	void /not-regparm/
	h_generic_calc_SarN8x16 ( /OUT/V128* res,
	V128* argL, UInt nn)
	{
	/* vassert(nn < 8); */
	nn &= 7;
	res->w8[ 0] = sar8(argL->w8[ 0], nn);
	res->w8[ 1] = sar8(argL->w8[ 1], nn);
	res->w8[ 2] = sar8(argL->w8[ 2], nn);
	res->w8[ 3] = sar8(argL->w8[ 3], nn);
	res->w8[ 4] = sar8(argL->w8[ 4], nn);
	res->w8[ 5] = sar8(argL->w8[ 5], nn);
	res->w8[ 6] = sar8(argL->w8[ 6], nn);
	res->w8[ 7] = sar8(argL->w8[ 7], nn);
	res->w8[ 8] = sar8(argL->w8[ 8], nn);
	res->w8[ 9] = sar8(argL->w8[ 9], nn);
	res->w8[10] = sar8(argL->w8[10], nn);
	res->w8[11] = sar8(argL->w8[11], nn);
	res->w8[12] = sar8(argL->w8[12], nn);
	res->w8[13] = sar8(argL->w8[13], nn);
	res->w8[14] = sar8(argL->w8[14], nn);
	res->w8[15] = sar8(argL->w8[15], nn);
	}

	/* Saturating binary narrow.  The four w32 lanes of *argR are
	   narrowed with qnarrow32Sto16U into w16 lanes 0..3 of *res, and the
	   four w32 lanes of *argL into w16 lanes 4..7.
	   NOTE(review): result lanes are stored while inputs are still being
	   read, so this presumably expects res not to overlap argL/argR --
	   confirm against callers. */
	void VEX_REGPARM(3)
	     h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
	                                           V128* argL, V128* argR )
	{
	   UInt i;
	   /* Lower half of the result comes from argR ... */
	   for (i = 0; i < 4; i++) {
	      res->w16[i] = qnarrow32Sto16U(argR->w32[i]);
	   }
	   /* ... and the upper half from argL. */
	   for (i = 0; i < 4; i++) {
	      res->w16[4 + i] = qnarrow32Sto16U(argL->w32[i]);
	   }
	}

	/* Non-saturating binary narrow.  The eight w16 lanes of *argR are
	   narrowed with narrow16to8 into w8 lanes 0..7 of *res, and the
	   eight w16 lanes of *argL into w8 lanes 8..15.
	   NOTE(review): result lanes are stored while inputs are still being
	   read, so this presumably expects res not to overlap argL/argR --
	   confirm against callers. */
	void VEX_REGPARM(3)
	     h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
	                                        V128* argL, V128* argR )
	{
	   UInt i;
	   /* Lower half of the result comes from argR ... */
	   for (i = 0; i < 8; i++) {
	      res->w8[i] = narrow16to8(argR->w16[i]);
	   }
	   /* ... and the upper half from argL. */
	   for (i = 0; i < 8; i++) {
	      res->w8[8 + i] = narrow16to8(argL->w16[i]);
	   }
	}

	/* Non-saturating binary narrow.  The four w32 lanes of *argR are
	   narrowed with narrow32to16 into w16 lanes 0..3 of *res, and the
	   four w32 lanes of *argL into w16 lanes 4..7.
	   NOTE(review): result lanes are stored while inputs are still being
	   read, so this presumably expects res not to overlap argL/argR --
	   confirm against callers. */
	void VEX_REGPARM(3)
	     h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
	                                        V128* argL, V128* argR )
	{
	   UInt i;
	   /* Lower half of the result comes from argR ... */
	   for (i = 0; i < 4; i++) {
	      res->w16[i] = narrow32to16(argR->w32[i]);
	   }
	   /* ... and the upper half from argL. */
	   for (i = 0; i < 4; i++) {
	      res->w16[4 + i] = narrow32to16(argL->w32[i]);
	   }
	}

	/* Permute 32-bit lanes.  Result lane i is the w32 lane of *argL
	   selected by the low two bits of w32 lane i of *argR; the "& 3"
	   keeps the selector inside the four available lanes.
	   NOTE(review): result lanes are stored while argL is still being
	   read, so this presumably expects res not to overlap argL/argR --
	   confirm against callers. */
	void VEX_REGPARM(3)
	     h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
	                               V128* argL, V128* argR )
	{
	   UInt lane;
	   for (lane = 0; lane < 4; lane++) {
	      res->w32[lane] = argL->w32[ argR->w32[lane] & 3 ];
	   }
	}

	UInt /not-regparm/
	h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
	{
	UInt r = 0;
	if (w64hi & (1ULL << (64-1))) r \|= (1<<15);
	if (w64hi & (1ULL << (56-1))) r \|= (1<<14);
	if (w64hi & (1ULL << (48-1))) r \|= (1<<13);
	if (w64hi & (1ULL << (40-1))) r \|= (1<<12);
	if (w64hi & (1ULL << (32-1))) r \|= (1<<11);
	if (w64hi & (1ULL << (24-1))) r \|= (1<<10);
	if (w64hi & (1ULL << (16-1))) r \|= (1<<9);
	if (w64hi & (1ULL << ( 8-1))) r \|= (1<<8);
	if (w64lo & (1ULL << (64-1))) r \|= (1<<7);
	if (w64lo & (1ULL << (56-1))) r \|= (1<<6);
	if (w64lo & (1ULL << (48-1))) r \|= (1<<5);
	if (w64lo & (1ULL << (40-1))) r \|= (1<<4);
	if (w64lo & (1ULL << (32-1))) r \|= (1<<3);
	if (w64lo & (1ULL << (24-1))) r \|= (1<<2);
	if (w64lo & (1ULL << (16-1))) r \|= (1<<1);
	if (w64lo & (1ULL << ( 8-1))) r \|= (1<<0);
	return r;
	}

	/*---------------------------------------------------------------*/
	/*--- end host_generic_simd128.c ---*/
	/*---------------------------------------------------------------*/