/* src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c - platform/external/mesa3d - Git at Google */

 /**************************************************************************
  *
  * Copyright 2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  *
  **************************************************************************/


 /**
  * @file
  * YUV pixel format manipulation.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */


 #include "util/u_format.h"
 #include "util/u_cpu_detect.h"

 #include "lp_bld_arit.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 #include "lp_bld_init.h"
 #include "lp_bld_logic.h"

 /**
  * Split packed UYVY blocks into separate Y, U and V channel vectors.
  *
  * Every result element is masked with 0xff, so only the selected byte of
  * the block is kept.
  *
  * @param gallivm  state whose builder receives the generated instructions
  * @param n  number of elements in each vector
  * @param packed  is a <n x i32> vector with the packed UYVY blocks
  * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
  * @param y  out: Y value of the pixel picked by i
  * @param u  out: U value of the block
  * @param v  out: V value of the block
  */
 static void
 uyvy_to_yuv_soa(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i,
                 LLVMValueRef *y,
                 LLVMValueRef *u,
                 LLVMValueRef *v)
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type u32_type;
    LLVMValueRef luma, chroma_u, chroma_v;
    LLVMValueRef byte_mask;

    memset(&u32_type, 0, sizeof u32_type);
    u32_type.width = 32;
    u32_type.length = n;

    assert(lp_check_value(u32_type, packed));
    assert(lp_check_value(u32_type, i));

    /*
     * Channel positions inside one 32-bit block.
     *
     * Little endian:
     * y = (uyvy >> (16*i + 8)) & 0xff
     * u = (uyvy              ) & 0xff
     * v = (uyvy >> 16        ) & 0xff
     *
     * Big endian:
     * y = (uyvy >> (-16*i + 16)) & 0xff
     * u = (uyvy >> 24) & 0xff
     * v = (uyvy >>  8) & 0xff
     */

 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    /*
     * x86 lacks a shift with per-element count (it gets expanded to
     * roughly 5 instructions per element), so compute both candidate
     * luma values with uniform shifts and pick one per element.
     * Performance was not measured, but shader size shrinks noticeably
     * (less so when the cpu has no sse4.1 support).
     */
    if (util_cpu_caps.has_sse2 && n > 1) {
       struct lp_build_context bld32;
       LLVMValueRef y_first, y_second, is_first;

       lp_build_context_init(&bld32, gallivm, u32_type);

       y_first = LLVMBuildLShr(builder, packed,
                               lp_build_const_int_vec(gallivm, u32_type, 8), "");
       y_second = LLVMBuildLShr(builder, y_first,
                                lp_build_const_int_vec(gallivm, u32_type, 16), "");
       is_first = lp_build_compare(gallivm, u32_type, PIPE_FUNC_EQUAL, i,
                                   lp_build_const_int_vec(gallivm, u32_type, 0));
       luma = lp_build_select(&bld32, is_first, y_first, y_second);
    } else
 #endif
    {
       /* Generic path: one shift whose count depends on i. */
       LLVMValueRef scaled, shift;
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
       scaled = LLVMBuildMul(builder, i,
                             lp_build_const_int_vec(gallivm, u32_type, 16), "");
       shift = LLVMBuildAdd(builder, scaled,
                            lp_build_const_int_vec(gallivm, u32_type, 8), "");
 #else
       scaled = LLVMBuildMul(builder, i,
                             lp_build_const_int_vec(gallivm, u32_type, -16), "");
       shift = LLVMBuildAdd(builder, scaled,
                            lp_build_const_int_vec(gallivm, u32_type, 16), "");
 #endif
       luma = LLVMBuildLShr(builder, packed, shift, "");
    }

 #ifdef PIPE_ARCH_LITTLE_ENDIAN
    chroma_u = packed;
    chroma_v = LLVMBuildLShr(builder, packed,
                             lp_build_const_int_vec(gallivm, u32_type, 16), "");
 #else
    chroma_u = LLVMBuildLShr(builder, packed,
                             lp_build_const_int_vec(gallivm, u32_type, 24), "");
    chroma_v = LLVMBuildLShr(builder, packed,
                             lp_build_const_int_vec(gallivm, u32_type, 8), "");
 #endif

    /* Keep only the low byte of every shifted value. */
    byte_mask = lp_build_const_int_vec(gallivm, u32_type, 0xff);

    *y = LLVMBuildAnd(builder, luma, byte_mask, "y");
    *u = LLVMBuildAnd(builder, chroma_u, byte_mask, "u");
    *v = LLVMBuildAnd(builder, chroma_v, byte_mask, "v");
 }


 /**
  * Split packed YUYV blocks into separate Y, U and V channel vectors.
  *
  * Every result element is masked with 0xff, so only the selected byte of
  * the block is kept.
  *
  * @param gallivm  state whose builder receives the generated instructions
  * @param n  number of elements in each vector
  * @param packed  is a <n x i32> vector with the packed YUYV blocks
  * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
  * @param y  out: Y value of the pixel picked by i
  * @param u  out: U value of the block
  * @param v  out: V value of the block
  */
 static void
 yuyv_to_yuv_soa(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef packed,
                 LLVMValueRef i,
                 LLVMValueRef *y,
                 LLVMValueRef *u,
                 LLVMValueRef *v)
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type u32_type;
    LLVMValueRef luma, chroma_u, chroma_v;
    LLVMValueRef byte_mask;

    memset(&u32_type, 0, sizeof u32_type);
    u32_type.width = 32;
    u32_type.length = n;

    assert(lp_check_value(u32_type, packed));
    assert(lp_check_value(u32_type, i));

    /*
     * Channel positions inside one 32-bit block.
     *
     * Little endian:
     * y = (yuyv >> 16*i) & 0xff
     * u = (yuyv >> 8   ) & 0xff
     * v = (yuyv >> 24  ) & 0xff
     *
     * Big endian:
     * y = (yuyv >> (-16*i + 24)) & 0xff
     * u = (yuyv >> 16)           & 0xff
     * v = (yuyv)                 & 0xff
     */

 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    /*
     * x86 lacks a shift with per-element count (it gets expanded to
     * roughly 5 instructions per element), so build the shifted candidate
     * with a uniform shift and pick per element between it and the
     * unshifted block. Performance was not measured, but shader size
     * shrinks noticeably (less so when the cpu has no sse4.1 support).
     */
    if (util_cpu_caps.has_sse2 && n > 1) {
       struct lp_build_context bld32;
       LLVMValueRef y_second, is_first;

       lp_build_context_init(&bld32, gallivm, u32_type);

       y_second = LLVMBuildLShr(builder, packed,
                                lp_build_const_int_vec(gallivm, u32_type, 16), "");
       is_first = lp_build_compare(gallivm, u32_type, PIPE_FUNC_EQUAL, i,
                                   lp_build_const_int_vec(gallivm, u32_type, 0));
       luma = lp_build_select(&bld32, is_first, packed, y_second);
    } else
 #endif
    {
       /* Generic path: one shift whose count depends on i. */
       LLVMValueRef shift;
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
       shift = LLVMBuildMul(builder, i,
                            lp_build_const_int_vec(gallivm, u32_type, 16), "");
 #else
       shift = LLVMBuildMul(builder, i,
                            lp_build_const_int_vec(gallivm, u32_type, -16), "");
       shift = LLVMBuildAdd(builder, shift,
                            lp_build_const_int_vec(gallivm, u32_type, 24), "");
 #endif
       luma = LLVMBuildLShr(builder, packed, shift, "");
    }

 #ifdef PIPE_ARCH_LITTLE_ENDIAN
    chroma_u = LLVMBuildLShr(builder, packed,
                             lp_build_const_int_vec(gallivm, u32_type, 8), "");
    chroma_v = LLVMBuildLShr(builder, packed,
                             lp_build_const_int_vec(gallivm, u32_type, 24), "");
 #else
    chroma_u = LLVMBuildLShr(builder, packed,
                             lp_build_const_int_vec(gallivm, u32_type, 16), "");
    chroma_v = packed;
 #endif

    /* Keep only the low byte of every shifted value. */
    byte_mask = lp_build_const_int_vec(gallivm, u32_type, 0xff);

    *y = LLVMBuildAnd(builder, luma, byte_mask, "y");
    *u = LLVMBuildAnd(builder, chroma_u, byte_mask, "u");
    *v = LLVMBuildAnd(builder, chroma_v, byte_mask, "v");
 }


 /**
  * Convert Y, U, V channel vectors to R, G, B channel vectors using
  * fixed-point integer arithmetic with 8 fractional bits.
  *
  * @param gallivm  state whose builder receives the generated instructions
  * @param n  number of elements in each vector
  * @param y,u,v  <n x i32> input channels
  * @param r,g,b  out: <n x i32> results, clamped to [0, 255]
  */
 static inline void
 yuv_to_rgb_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
                LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type s32_type;
    struct lp_build_context bld;
    LLVMValueRef zero, eight, sixteen, half, max_val;
    LLVMValueRef y_coef, ug_coef, ub_coef, vr_coef, vg_coef;
    LLVMValueRef red, green, blue;
    LLVMValueRef green_u, green_v;

    memset(&s32_type, 0, sizeof s32_type);
    s32_type.sign = TRUE;
    s32_type.width = 32;
    s32_type.length = n;

    lp_build_context_init(&bld, gallivm, s32_type);

    assert(lp_check_value(s32_type, y));
    assert(lp_check_value(s32_type, u));
    assert(lp_check_value(s32_type, v));

    /* Offsets, rounding term, shift count and clamp bounds. */
    zero    = lp_build_const_int_vec(gallivm, s32_type,   0);
    eight   = lp_build_const_int_vec(gallivm, s32_type,   8);
    sixteen = lp_build_const_int_vec(gallivm, s32_type,  16);
    half    = lp_build_const_int_vec(gallivm, s32_type, 128);
    max_val = lp_build_const_int_vec(gallivm, s32_type, 255);

    /* Conversion coefficients, scaled by 256. */
    y_coef  = lp_build_const_int_vec(gallivm, s32_type,  298);
    ug_coef = lp_build_const_int_vec(gallivm, s32_type, -100);
    ub_coef = lp_build_const_int_vec(gallivm, s32_type,  516);
    vr_coef = lp_build_const_int_vec(gallivm, s32_type,  409);
    vg_coef = lp_build_const_int_vec(gallivm, s32_type, -208);

    /* Remove the channel offsets: y -= 16, u -= 128, v -= 128. */
    y = LLVMBuildSub(builder, y, sixteen, "");
    u = LLVMBuildSub(builder, u, half, "");
    v = LLVMBuildSub(builder, v, half, "");

    /*
     * r = 298 * _y            + 409 * _v + 128;
     * g = 298 * _y - 100 * _u - 208 * _v + 128;
     * b = 298 * _y + 516 * _u            + 128;
     *
     * The (298 * _y + 128) term is shared, so it is computed once.
     */
    y = LLVMBuildMul(builder, y, y_coef, "");
    y = LLVMBuildAdd(builder, y, half, "");

    red = LLVMBuildMul(builder, v, vr_coef, "");
    green_u = LLVMBuildMul(builder, u, ug_coef, "");
    green_v = LLVMBuildMul(builder, v, vg_coef, "");
    green = LLVMBuildAdd(builder, green_u, green_v, "");
    blue = LLVMBuildMul(builder, u, ub_coef, "");

    red = LLVMBuildAdd(builder, red, y, "");
    green = LLVMBuildAdd(builder, green, y, "");
    blue = LLVMBuildAdd(builder, blue, y, "");

    /* Drop the 8 fractional bits; arithmetic shift keeps the sign. */
    red = LLVMBuildAShr(builder, red, eight, "r");
    green = LLVMBuildAShr(builder, green, eight, "g");
    blue = LLVMBuildAShr(builder, blue, eight, "b");

    /* Clamp to [0, 255]. */
    *r = lp_build_clamp(&bld, red, zero, max_val);
    *g = lp_build_clamp(&bld, green, zero, max_val);
    *b = lp_build_clamp(&bld, blue, zero, max_val);
 }


 /**
  * Pack R, G, B channel vectors plus a constant 0xff alpha into RGBA
  * pixels, one 32-bit word per pixel, and return them as bytes.
  *
  * @param gallivm  state whose builder receives the generated instructions
  * @param n  number of pixels
  * @param r,g,b  <n x i32> channel vectors
  * @return  the packed words bitcast to a <4*n x i8> vector
  */
 static LLVMValueRef
 rgb_to_rgba_aos(struct gallivm_state *gallivm,
                 unsigned n,
                 LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type s32_type;
    LLVMValueRef alpha;
    LLVMValueRef word;
    LLVMTypeRef bytes_type;

    memset(&s32_type, 0, sizeof s32_type);
    s32_type.sign = TRUE;
    s32_type.width = 32;
    s32_type.length = n;

    assert(lp_check_value(s32_type, r));
    assert(lp_check_value(s32_type, g));
    assert(lp_check_value(s32_type, b));

    /*
     * Move every channel to its byte position inside the 32-bit word.
     */

 #ifdef PIPE_ARCH_LITTLE_ENDIAN
    /* r already sits in the low byte. */
    g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, s32_type, 8), "");
    b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, s32_type, 16), "");
    alpha = lp_build_const_int_vec(gallivm, s32_type, 0xff000000);
 #else
    r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, s32_type, 24), "");
    g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, s32_type, 16), "");
    b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, s32_type, 8), "");
    alpha = lp_build_const_int_vec(gallivm, s32_type, 0x000000ff);
 #endif

    word = LLVMBuildOr(builder, r, g, "");
    word = LLVMBuildOr(builder, word, b, "");
    word = LLVMBuildOr(builder, word, alpha, "");

    /* Reinterpret the n words as 4*n bytes. */
    bytes_type = LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n);

    return LLVMBuildBitCast(builder, word, bytes_type, "");
 }


 /**
  * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS.
  *
  * Runs three stages: channel extraction, YUV to RGB conversion and
  * packing into RGBA bytes.
  */
 static LLVMValueRef
 uyvy_to_rgba_aos(struct gallivm_state *gallivm,
                  unsigned n,
                  LLVMValueRef packed,
                  LLVMValueRef i)
 {
    LLVMValueRef luma, cb, cr;
    LLVMValueRef red, green, blue;

    uyvy_to_yuv_soa(gallivm, n, packed, i, &luma, &cb, &cr);
    yuv_to_rgb_soa(gallivm, n, luma, cb, cr, &red, &green, &blue);

    return rgb_to_rgba_aos(gallivm, n, red, green, blue);
 }


 /**
  * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS.
  *
  * Runs three stages: channel extraction, YUV to RGB conversion and
  * packing into RGBA bytes.
  */
 static LLVMValueRef
 yuyv_to_rgba_aos(struct gallivm_state *gallivm,
                  unsigned n,
                  LLVMValueRef packed,
                  LLVMValueRef i)
 {
    LLVMValueRef luma, cb, cr;
    LLVMValueRef red, green, blue;

    yuyv_to_yuv_soa(gallivm, n, packed, i, &luma, &cb, &cr);
    yuv_to_rgb_soa(gallivm, n, luma, cb, cr, &red, &green, &blue);

    return rgb_to_rgba_aos(gallivm, n, red, green, blue);
 }


 /**
  * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS.
  *
  * Reuses the UYVY extractor: its Y output (selected by i) becomes green,
  * its U output red and its V output blue.
  */
 static LLVMValueRef
 rgbg_to_rgba_aos(struct gallivm_state *gallivm,
                  unsigned n,
                  LLVMValueRef packed,
                  LLVMValueRef i)
 {
    LLVMValueRef red, green, blue;

    uyvy_to_yuv_soa(gallivm, n, packed, i, &green, &red, &blue);

    return rgb_to_rgba_aos(gallivm, n, red, green, blue);
 }


 /**
  * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS.
  *
  * Reuses the YUYV extractor: its Y output (selected by i) becomes green,
  * its U output red and its V output blue.
  */
 static LLVMValueRef
 grgb_to_rgba_aos(struct gallivm_state *gallivm,
                  unsigned n,
                  LLVMValueRef packed,
                  LLVMValueRef i)
 {
    LLVMValueRef red, green, blue;

    yuyv_to_yuv_soa(gallivm, n, packed, i, &green, &red, &blue);

    return rgb_to_rgba_aos(gallivm, n, red, green, blue);
 }

 /**
  * Convert from <n x i32> packed GR_BR to <4n x i8> RGBA AoS.
  *
  * Reuses the UYVY extractor: its Y output (selected by i) becomes red,
  * its U output green and its V output blue.
  */
 static LLVMValueRef
 grbr_to_rgba_aos(struct gallivm_state *gallivm,
                  unsigned n,
                  LLVMValueRef packed,
                  LLVMValueRef i)
 {
    LLVMValueRef red, green, blue;

    uyvy_to_yuv_soa(gallivm, n, packed, i, &red, &green, &blue);

    return rgb_to_rgba_aos(gallivm, n, red, green, blue);
 }


 /**
  * Convert from <n x i32> packed RG_RB to <4n x i8> RGBA AoS.
  *
  * Reuses the YUYV extractor: its Y output (selected by i) becomes red,
  * its U output green and its V output blue.
  */
 static LLVMValueRef
 rgrb_to_rgba_aos(struct gallivm_state *gallivm,
                  unsigned n,
                  LLVMValueRef packed,
                  LLVMValueRef i)
 {
    LLVMValueRef red, green, blue;

    yuyv_to_yuv_soa(gallivm, n, packed, i, &red, &green, &blue);

    return rgb_to_rgba_aos(gallivm, n, red, green, blue);
 }

 /**
  * Fetch pixels of a subsampled format and convert them to RGBA AoS.
  *
  * The packed 32-bit blocks are gathered with lp_build_gather() from
  * base_ptr and offset, then handed to the converter matching the format.
  *
  * @param gallivm  state whose builder receives the generated instructions
  * @param format_desc  format description; asserted to be a subsampled
  *                     layout with 32-bit blocks of 2x1 pixels
  * @param n  is the number of pixels processed
  * @param base_ptr  base pointer passed to the gather
  * @param offset  offsets passed to the gather
  * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
  * @param j  unused
  * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
  */
 LLVMValueRef
 lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
                                    const struct util_format_description *format_desc,
                                    unsigned n,
                                    LLVMValueRef base_ptr,
                                    LLVMValueRef offset,
                                    LLVMValueRef i,
                                    LLVMValueRef j)
 {
    LLVMValueRef blocks;

    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
    assert(format_desc->block.bits == 32);
    assert(format_desc->block.width == 2);
    assert(format_desc->block.height == 1);

    blocks = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);

    /* Blocks are one pixel high, so the y coordinate is not needed. */
    (void)j;

    switch (format_desc->format) {
    case PIPE_FORMAT_UYVY:
       return uyvy_to_rgba_aos(gallivm, n, blocks, i);
    case PIPE_FORMAT_YUYV:
       return yuyv_to_rgba_aos(gallivm, n, blocks, i);
    case PIPE_FORMAT_R8G8_B8G8_UNORM:
       return rgbg_to_rgba_aos(gallivm, n, blocks, i);
    case PIPE_FORMAT_G8R8_G8B8_UNORM:
       return grgb_to_rgba_aos(gallivm, n, blocks, i);
    case PIPE_FORMAT_G8R8_B8R8_UNORM:
       return grbr_to_rgba_aos(gallivm, n, blocks, i);
    case PIPE_FORMAT_R8G8_R8B8_UNORM:
       return rgrb_to_rgba_aos(gallivm, n, blocks, i);
    default:
       /* Unknown format: trap in debug builds, undef otherwise. */
       assert(0);
       return LLVMGetUndef(LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n));
    }
 }
	/**************************************************************************
	*
	* Copyright 2010 VMware, Inc.
	* All Rights Reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the
	* "Software"), to deal in the Software without restriction, including
	* without limitation the rights to use, copy, modify, merge, publish,
	* distribute, sub license, and/or sell copies of the Software, and to
	* permit persons to whom the Software is furnished to do so, subject to
	* the following conditions:
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
	* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
	* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
	* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
	* USE OR OTHER DEALINGS IN THE SOFTWARE.
	*
	* The above copyright notice and this permission notice (including the
	* next paragraph) shall be included in all copies or substantial portions
	* of the Software.
	*
	**************************************************************************/


	/**
	* @file
	* YUV pixel format manipulation.
	*
	* @author Jose Fonseca <jfonseca@vmware.com>
	*/


	#include "util/u_format.h"
	#include "util/u_cpu_detect.h"

	#include "lp_bld_arit.h"
	#include "lp_bld_type.h"
	#include "lp_bld_const.h"
	#include "lp_bld_conv.h"
	#include "lp_bld_gather.h"
	#include "lp_bld_format.h"
	#include "lp_bld_init.h"
	#include "lp_bld_logic.h"

	/**
	 * Extract Y, U, V channels from packed UYVY.
	 *
	 * Every result element is masked with 0xff, so only the selected byte of
	 * the block is kept.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 *
	 * @param gallivm  state whose builder receives the generated instructions
	 * @param n  number of elements in each vector
	 * @param packed  is a <n x i32> vector with the packed UYVY blocks
	 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
	 * @param y  out: Y value of the pixel picked by i
	 * @param u  out: U value of the block
	 * @param v  out: V value of the block
	 */
	static void
	uyvy_to_yuv_soa(struct gallivm_state *gallivm,
	                unsigned n,
	                LLVMValueRef packed,
	                LLVMValueRef i,
	                LLVMValueRef *y,
	                LLVMValueRef *u,
	                LLVMValueRef *v)
	{
	   LLVMBuilderRef builder = gallivm->builder;
	   struct lp_type type;
	   LLVMValueRef mask;

	   memset(&type, 0, sizeof type);
	   type.width = 32;
	   type.length = n;

	   assert(lp_check_value(type, packed));
	   assert(lp_check_value(type, i));

	   /*
	    * Little endian:
	    * y = (uyvy >> (16*i + 8)) & 0xff
	    * u = (uyvy              ) & 0xff
	    * v = (uyvy >> 16        ) & 0xff
	    *
	    * Big endian:
	    * y = (uyvy >> (-16*i + 16)) & 0xff
	    * u = (uyvy >> 24) & 0xff
	    * v = (uyvy >>  8) & 0xff
	    */

	#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
	   /*
	    * Avoid shift with per-element count.
	    * No support on x86, gets translated to roughly 5 instructions
	    * per element. Didn't measure performance but cuts shader size
	    * by quite a bit (less difference if cpu has no sse4.1 support).
	    */
	   if (util_cpu_caps.has_sse2 && n > 1) {
	      LLVMValueRef sel, tmp, tmp2;
	      struct lp_build_context bld32;

	      lp_build_context_init(&bld32, gallivm, type);

	      /* tmp holds Y of pixel 0, tmp2 holds Y of pixel 1; pick by i. */
	      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
	      tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 16), "");
	      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(gallivm, type, 0));
	      *y = lp_build_select(&bld32, sel, tmp, tmp2);
	   } else
	#endif
	   {
	      LLVMValueRef shift;
	#ifdef PIPE_ARCH_LITTLE_ENDIAN
	      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), "");
	      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 8), "");
	#else
	      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, -16), "");
	      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 16), "");
	#endif
	      *y = LLVMBuildLShr(builder, packed, shift, "");
	   }

	#ifdef PIPE_ARCH_LITTLE_ENDIAN
	   *u = packed;
	   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), "");
	#else
	   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 24), "");
	   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
	#endif

	   mask = lp_build_const_int_vec(gallivm, type, 0xff);

	   /* Mask the values and store them through the output pointers. */
	   *y = LLVMBuildAnd(builder, *y, mask, "y");
	   *u = LLVMBuildAnd(builder, *u, mask, "u");
	   *v = LLVMBuildAnd(builder, *v, mask, "v");
	}


	/**
	 * Extract Y, U, V channels from packed YUYV.
	 *
	 * Every result element is masked with 0xff, so only the selected byte of
	 * the block is kept.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 *
	 * @param gallivm  state whose builder receives the generated instructions
	 * @param n  number of elements in each vector
	 * @param packed  is a <n x i32> vector with the packed YUYV blocks
	 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
	 * @param y  out: Y value of the pixel picked by i
	 * @param u  out: U value of the block
	 * @param v  out: V value of the block
	 */
	static void
	yuyv_to_yuv_soa(struct gallivm_state *gallivm,
	                unsigned n,
	                LLVMValueRef packed,
	                LLVMValueRef i,
	                LLVMValueRef *y,
	                LLVMValueRef *u,
	                LLVMValueRef *v)
	{
	   LLVMBuilderRef builder = gallivm->builder;
	   struct lp_type type;
	   LLVMValueRef mask;

	   memset(&type, 0, sizeof type);
	   type.width = 32;
	   type.length = n;

	   assert(lp_check_value(type, packed));
	   assert(lp_check_value(type, i));

	   /*
	    * Little endian:
	    * y = (yuyv >> 16*i) & 0xff
	    * u = (yuyv >> 8   ) & 0xff
	    * v = (yuyv >> 24  ) & 0xff
	    *
	    * Big endian:
	    * y = (yuyv >> (-16*i + 24)) & 0xff
	    * u = (yuyv >> 16)           & 0xff
	    * v = (yuyv)                 & 0xff
	    */

	#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
	   /*
	    * Avoid shift with per-element count.
	    * No support on x86, gets translated to roughly 5 instructions
	    * per element. Didn't measure performance but cuts shader size
	    * by quite a bit (less difference if cpu has no sse4.1 support).
	    */
	   if (util_cpu_caps.has_sse2 && n > 1) {
	      LLVMValueRef sel, tmp;
	      struct lp_build_context bld32;

	      lp_build_context_init(&bld32, gallivm, type);

	      /* packed has Y of pixel 0 in its low byte, tmp has Y of pixel 1. */
	      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), "");
	      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(gallivm, type, 0));
	      *y = lp_build_select(&bld32, sel, packed, tmp);
	   } else
	#endif
	   {
	      LLVMValueRef shift;
	#ifdef PIPE_ARCH_LITTLE_ENDIAN
	      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), "");
	#else
	      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, -16), "");
	      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(gallivm, type, 24), "");
	#endif
	      *y = LLVMBuildLShr(builder, packed, shift, "");
	   }

	#ifdef PIPE_ARCH_LITTLE_ENDIAN
	   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
	   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 24), "");
	#else
	   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), "");
	   *v = packed;
	#endif

	   mask = lp_build_const_int_vec(gallivm, type, 0xff);

	   /* Mask the values and store them through the output pointers. */
	   *y = LLVMBuildAnd(builder, *y, mask, "y");
	   *u = LLVMBuildAnd(builder, *u, mask, "u");
	   *v = LLVMBuildAnd(builder, *v, mask, "v");
	}


	/**
	 * Convert Y, U, V channel vectors to R, G, B channel vectors using
	 * fixed-point integer arithmetic with 8 fractional bits.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 *
	 * @param gallivm  state whose builder receives the generated instructions
	 * @param n  number of elements in each vector
	 * @param y,u,v  <n x i32> input channels
	 * @param r,g,b  out: <n x i32> results, clamped to [0, 255]
	 */
	static inline void
	yuv_to_rgb_soa(struct gallivm_state *gallivm,
	               unsigned n,
	               LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
	               LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
	{
	   LLVMBuilderRef builder = gallivm->builder;
	   struct lp_type type;
	   struct lp_build_context bld;

	   LLVMValueRef c0;
	   LLVMValueRef c8;
	   LLVMValueRef c16;
	   LLVMValueRef c128;
	   LLVMValueRef c255;

	   LLVMValueRef cy;
	   LLVMValueRef cug;
	   LLVMValueRef cub;
	   LLVMValueRef cvr;
	   LLVMValueRef cvg;

	   memset(&type, 0, sizeof type);
	   type.sign = TRUE;
	   type.width = 32;
	   type.length = n;

	   lp_build_context_init(&bld, gallivm, type);

	   assert(lp_check_value(type, y));
	   assert(lp_check_value(type, u));
	   assert(lp_check_value(type, v));

	   /*
	    * Constants
	    */

	   c0   = lp_build_const_int_vec(gallivm, type,   0);
	   c8   = lp_build_const_int_vec(gallivm, type,   8);
	   c16  = lp_build_const_int_vec(gallivm, type,  16);
	   c128 = lp_build_const_int_vec(gallivm, type, 128);
	   c255 = lp_build_const_int_vec(gallivm, type, 255);

	   cy  = lp_build_const_int_vec(gallivm, type,  298);
	   cug = lp_build_const_int_vec(gallivm, type, -100);
	   cub = lp_build_const_int_vec(gallivm, type,  516);
	   cvr = lp_build_const_int_vec(gallivm, type,  409);
	   cvg = lp_build_const_int_vec(gallivm, type, -208);

	   /*
	    *  y -= 16;
	    *  u -= 128;
	    *  v -= 128;
	    */

	   y = LLVMBuildSub(builder, y, c16, "");
	   u = LLVMBuildSub(builder, u, c128, "");
	   v = LLVMBuildSub(builder, v, c128, "");

	   /*
	    * r = 298 * _y            + 409 * _v + 128;
	    * g = 298 * _y - 100 * _u - 208 * _v + 128;
	    * b = 298 * _y + 516 * _u            + 128;
	    */

	   /* The (298 * _y + 128) term is shared by all three channels. */
	   y = LLVMBuildMul(builder, y, cy, "");
	   y = LLVMBuildAdd(builder, y, c128, "");

	   *r = LLVMBuildMul(builder, v, cvr, "");
	   *g = LLVMBuildAdd(builder,
	                     LLVMBuildMul(builder, u, cug, ""),
	                     LLVMBuildMul(builder, v, cvg, ""),
	                     "");
	   *b = LLVMBuildMul(builder, u, cub, "");

	   *r = LLVMBuildAdd(builder, *r, y, "");
	   *g = LLVMBuildAdd(builder, *g, y, "");
	   *b = LLVMBuildAdd(builder, *b, y, "");

	   /*
	    * r >>= 8;
	    * g >>= 8;
	    * b >>= 8;
	    */

	   *r = LLVMBuildAShr(builder, *r, c8, "r");
	   *g = LLVMBuildAShr(builder, *g, c8, "g");
	   *b = LLVMBuildAShr(builder, *b, c8, "b");

	   /*
	    * Clamp
	    */

	   *r = lp_build_clamp(&bld, *r, c0, c255);
	   *g = lp_build_clamp(&bld, *g, c0, c255);
	   *b = lp_build_clamp(&bld, *b, c0, c255);
	}


	/**
	 * Pack R, G, B channel vectors plus a constant 0xff alpha into RGBA
	 * pixels, one 32-bit word per pixel, and return them as bytes.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 *
	 * @param gallivm  state whose builder receives the generated instructions
	 * @param n  number of pixels
	 * @param r,g,b  <n x i32> channel vectors
	 * @return  the packed words bitcast to a <4*n x i8> vector
	 */
	static LLVMValueRef
	rgb_to_rgba_aos(struct gallivm_state *gallivm,
	                unsigned n,
	                LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
	{
	   LLVMBuilderRef builder = gallivm->builder;
	   struct lp_type s32_type;
	   LLVMValueRef alpha;
	   LLVMValueRef word;
	   LLVMTypeRef bytes_type;

	   memset(&s32_type, 0, sizeof s32_type);
	   s32_type.sign = TRUE;
	   s32_type.width = 32;
	   s32_type.length = n;

	   assert(lp_check_value(s32_type, r));
	   assert(lp_check_value(s32_type, g));
	   assert(lp_check_value(s32_type, b));

	   /*
	    * Move every channel to its byte position inside the 32-bit word.
	    */

	#ifdef PIPE_ARCH_LITTLE_ENDIAN
	   /* r already sits in the low byte. */
	   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, s32_type, 8), "");
	   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, s32_type, 16), "");
	   alpha = lp_build_const_int_vec(gallivm, s32_type, 0xff000000);
	#else
	   r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, s32_type, 24), "");
	   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, s32_type, 16), "");
	   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, s32_type, 8), "");
	   alpha = lp_build_const_int_vec(gallivm, s32_type, 0x000000ff);
	#endif

	   word = LLVMBuildOr(builder, r, g, "");
	   word = LLVMBuildOr(builder, word, b, "");
	   word = LLVMBuildOr(builder, word, alpha, "");

	   /* Reinterpret the n words as 4*n bytes. */
	   bytes_type = LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n);

	   return LLVMBuildBitCast(builder, word, bytes_type, "");
	}


	/**
	 * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS.
	 *
	 * Runs three stages: channel extraction, YUV to RGB conversion and
	 * packing into RGBA bytes.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 */
	static LLVMValueRef
	uyvy_to_rgba_aos(struct gallivm_state *gallivm,
	                 unsigned n,
	                 LLVMValueRef packed,
	                 LLVMValueRef i)
	{
	   LLVMValueRef luma, cb, cr;
	   LLVMValueRef red, green, blue;

	   uyvy_to_yuv_soa(gallivm, n, packed, i, &luma, &cb, &cr);
	   yuv_to_rgb_soa(gallivm, n, luma, cb, cr, &red, &green, &blue);

	   return rgb_to_rgba_aos(gallivm, n, red, green, blue);
	}


	/**
	 * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS.
	 *
	 * Runs three stages: channel extraction, YUV to RGB conversion and
	 * packing into RGBA bytes.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 */
	static LLVMValueRef
	yuyv_to_rgba_aos(struct gallivm_state *gallivm,
	                 unsigned n,
	                 LLVMValueRef packed,
	                 LLVMValueRef i)
	{
	   LLVMValueRef luma, cb, cr;
	   LLVMValueRef red, green, blue;

	   yuyv_to_yuv_soa(gallivm, n, packed, i, &luma, &cb, &cr);
	   yuv_to_rgb_soa(gallivm, n, luma, cb, cr, &red, &green, &blue);

	   return rgb_to_rgba_aos(gallivm, n, red, green, blue);
	}


	/**
	 * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS.
	 *
	 * Reuses the UYVY extractor: its Y output (selected by i) becomes green,
	 * its U output red and its V output blue.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 */
	static LLVMValueRef
	rgbg_to_rgba_aos(struct gallivm_state *gallivm,
	                 unsigned n,
	                 LLVMValueRef packed,
	                 LLVMValueRef i)
	{
	   LLVMValueRef red, green, blue;

	   uyvy_to_yuv_soa(gallivm, n, packed, i, &green, &red, &blue);

	   return rgb_to_rgba_aos(gallivm, n, red, green, blue);
	}


	/**
	 * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS.
	 *
	 * Reuses the YUYV extractor: its Y output (selected by i) becomes green,
	 * its U output red and its V output blue.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 */
	static LLVMValueRef
	grgb_to_rgba_aos(struct gallivm_state *gallivm,
	                 unsigned n,
	                 LLVMValueRef packed,
	                 LLVMValueRef i)
	{
	   LLVMValueRef red, green, blue;

	   yuyv_to_yuv_soa(gallivm, n, packed, i, &green, &red, &blue);

	   return rgb_to_rgba_aos(gallivm, n, red, green, blue);
	}

	/**
	 * Convert from <n x i32> packed GR_BR to <4n x i8> RGBA AoS.
	 *
	 * Reuses the UYVY extractor: its Y output (selected by i) becomes red,
	 * its U output green and its V output blue.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 */
	static LLVMValueRef
	grbr_to_rgba_aos(struct gallivm_state *gallivm,
	                 unsigned n,
	                 LLVMValueRef packed,
	                 LLVMValueRef i)
	{
	   LLVMValueRef red, green, blue;

	   uyvy_to_yuv_soa(gallivm, n, packed, i, &red, &green, &blue);

	   return rgb_to_rgba_aos(gallivm, n, red, green, blue);
	}


	/**
	 * Convert from <n x i32> packed RG_RB to <4n x i8> RGBA AoS.
	 *
	 * Reuses the YUYV extractor: its Y output (selected by i) becomes red,
	 * its U output green and its V output blue.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 */
	static LLVMValueRef
	rgrb_to_rgba_aos(struct gallivm_state *gallivm,
	                 unsigned n,
	                 LLVMValueRef packed,
	                 LLVMValueRef i)
	{
	   LLVMValueRef red, green, blue;

	   yuyv_to_yuv_soa(gallivm, n, packed, i, &red, &green, &blue);

	   return rgb_to_rgba_aos(gallivm, n, red, green, blue);
	}

	/**
	 * Fetch pixels of a subsampled format and convert them to RGBA AoS.
	 *
	 * The packed 32-bit blocks are gathered with lp_build_gather() from
	 * base_ptr and offset, then handed to the converter matching the format.
	 *
	 * NOTE(review): a function with this name is also defined earlier in this
	 * file; one of the two copies has to go before the file can compile.
	 *
	 * @param gallivm  state whose builder receives the generated instructions
	 * @param format_desc  format description; asserted to be a subsampled
	 *                     layout with 32-bit blocks of 2x1 pixels
	 * @param n  is the number of pixels processed
	 * @param base_ptr  base pointer passed to the gather
	 * @param offset  offsets passed to the gather
	 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
	 * @param j  unused
	 * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
	 */
	LLVMValueRef
	lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
	                                   const struct util_format_description *format_desc,
	                                   unsigned n,
	                                   LLVMValueRef base_ptr,
	                                   LLVMValueRef offset,
	                                   LLVMValueRef i,
	                                   LLVMValueRef j)
	{
	   LLVMValueRef blocks;

	   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
	   assert(format_desc->block.bits == 32);
	   assert(format_desc->block.width == 2);
	   assert(format_desc->block.height == 1);

	   blocks = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);

	   /* Blocks are one pixel high, so the y coordinate is not needed. */
	   (void)j;

	   switch (format_desc->format) {
	   case PIPE_FORMAT_UYVY:
	      return uyvy_to_rgba_aos(gallivm, n, blocks, i);
	   case PIPE_FORMAT_YUYV:
	      return yuyv_to_rgba_aos(gallivm, n, blocks, i);
	   case PIPE_FORMAT_R8G8_B8G8_UNORM:
	      return rgbg_to_rgba_aos(gallivm, n, blocks, i);
	   case PIPE_FORMAT_G8R8_G8B8_UNORM:
	      return grgb_to_rgba_aos(gallivm, n, blocks, i);
	   case PIPE_FORMAT_G8R8_B8R8_UNORM:
	      return grbr_to_rgba_aos(gallivm, n, blocks, i);
	   case PIPE_FORMAT_R8G8_R8B8_UNORM:
	      return rgrb_to_rgba_aos(gallivm, n, blocks, i);
	   default:
	      /* Unknown format: trap in debug builds, undef otherwise. */
	      assert(0);
	      return LLVMGetUndef(LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n));
	   }
	}