| /* |
| * Copyright © 2013-2015 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "isl/isl.h" |
| #include "brw_fs_surface_builder.h" |
| #include "brw_fs.h" |
| |
| using namespace brw; |
| |
| namespace brw { |
| namespace surface_access { |
| namespace { |
| /** |
| * Generate a logical send opcode for a surface message and return |
| * the result. |
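| * |
| * The sources are, in order: the address payload, the data payload, |
| * the uniformized surface index, the coordinate dimension count and |
| * the message-specific argument (the component count for reads and |
| * writes, or the atomic opcode for atomics). |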
| */ |
| fs_reg |
| emit_send(const fs_builder &bld, enum opcode opcode, |
| const fs_reg &addr, const fs_reg &src, const fs_reg &surface, |
| unsigned dims, unsigned arg, unsigned rsize, |
| brw_predicate pred = BRW_PREDICATE_NONE) |
| { |
| /* Reduce the dynamically uniform surface index to a single |
| * scalar. |
| */ |
| const fs_reg usurface = bld.emit_uniformize(surface); |
| const fs_reg srcs[] = { |
| addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg) |
| }; |
| const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize); |
| fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); |
| |
| inst->size_written = rsize * dst.component_size(inst->exec_size); |
| inst->predicate = pred; |
| return dst; |
| } |
| } |
| |
| /** |
| * Emit an untyped surface read opcode. \p dims determines the number |
| * of components of the address and \p size the number of components of |
| * the returned value. |
| */ |
| fs_reg |
| emit_untyped_read(const fs_builder &bld, |
| const fs_reg &surface, const fs_reg &addr, |
| unsigned dims, unsigned size, |
| brw_predicate pred) |
| { |
| return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, |
| addr, fs_reg(), surface, dims, size, size, pred); |
| } |
| |
| /** |
| * Emit an untyped surface write opcode. \p dims determines the number |
| * of components of the address and \p size the number of components of |
| * the argument. |
| */ |
| void |
| emit_untyped_write(const fs_builder &bld, const fs_reg &surface, |
| const fs_reg &addr, const fs_reg &src, |
| unsigned dims, unsigned size, |
| brw_predicate pred) |
| { |
| emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, |
| addr, src, surface, dims, size, 0, pred); |
| } |
| |
| /** |
| * Emit an untyped surface atomic opcode. \p dims determines the number |
| * of components of the address and \p rsize the number of components of |
| * the returned value (either zero or one). |
| */ |
| fs_reg |
| emit_untyped_atomic(const fs_builder &bld, |
| const fs_reg &surface, const fs_reg &addr, |
| const fs_reg &src0, const fs_reg &src1, |
| unsigned dims, unsigned rsize, unsigned op, |
| brw_predicate pred) |
| { |
| /* FINISHME: Factor out this frequently recurring pattern into a |
| * helper function. |
| */ |
| const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); |
| const fs_reg srcs[] = { src0, src1 }; |
| const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); |
| bld.LOAD_PAYLOAD(tmp, srcs, n, 0); |
| |
| return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, |
| addr, tmp, surface, dims, op, rsize, pred); |
| } |
| |
| /** |
| * Emit a typed surface read opcode. \p dims determines the number of |
| * components of the address and \p size the number of components of the |
| * returned value. |
| */ |
| fs_reg |
| emit_typed_read(const fs_builder &bld, const fs_reg &surface, |
| const fs_reg &addr, unsigned dims, unsigned size) |
| { |
| return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, |
| addr, fs_reg(), surface, dims, size, size); |
| } |
| |
| /** |
| * Emit a typed surface write opcode. \p dims determines the number of |
| * components of the address and \p size the number of components of the |
| * argument. |
| */ |
| void |
| emit_typed_write(const fs_builder &bld, const fs_reg &surface, |
| const fs_reg &addr, const fs_reg &src, |
| unsigned dims, unsigned size) |
| { |
| emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, |
| addr, src, surface, dims, size, 0); |
| } |
| |
| /** |
| * Emit a typed surface atomic opcode. \p dims determines the number of |
| * components of the address and \p rsize the number of components of |
| * the returned value (either zero or one). |
| */ |
| fs_reg |
| emit_typed_atomic(const fs_builder &bld, const fs_reg &surface, |
| const fs_reg &addr, |
| const fs_reg &src0, const fs_reg &src1, |
| unsigned dims, unsigned rsize, unsigned op, |
| brw_predicate pred) |
| { |
| /* FINISHME: Factor out this frequently recurring pattern into a |
| * helper function. |
| */ |
| const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); |
| const fs_reg srcs[] = { src0, src1 }; |
| const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); |
| bld.LOAD_PAYLOAD(tmp, srcs, n, 0); |
| |
| return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, |
| addr, tmp, surface, dims, op, rsize); |
| } |
| } |
| } |
| |
| namespace { |
| namespace image_format_info { |
| /* The higher compiler layers use the GL enums for image formats even if |
| * they come in from SPIR-V or Vulkan. We need to turn them into an ISL |
| * enum before we can use them. |
| */ |
| enum isl_format |
| isl_format_for_gl_format(uint32_t gl_format) |
| { |
| switch (gl_format) { |
| case GL_R8: return ISL_FORMAT_R8_UNORM; |
| case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM; |
| case GL_R8UI: return ISL_FORMAT_R8_UINT; |
| case GL_R8I: return ISL_FORMAT_R8_SINT; |
| case GL_RG8: return ISL_FORMAT_R8G8_UNORM; |
| case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM; |
| case GL_RG8UI: return ISL_FORMAT_R8G8_UINT; |
| case GL_RG8I: return ISL_FORMAT_R8G8_SINT; |
| case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM; |
| case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM; |
| case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT; |
| case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT; |
| case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT; |
| case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM; |
| case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT; |
| case GL_R16: return ISL_FORMAT_R16_UNORM; |
| case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM; |
| case GL_R16F: return ISL_FORMAT_R16_FLOAT; |
| case GL_R16UI: return ISL_FORMAT_R16_UINT; |
| case GL_R16I: return ISL_FORMAT_R16_SINT; |
| case GL_RG16: return ISL_FORMAT_R16G16_UNORM; |
| case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM; |
| case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT; |
| case GL_RG16UI: return ISL_FORMAT_R16G16_UINT; |
| case GL_RG16I: return ISL_FORMAT_R16G16_SINT; |
| case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM; |
| case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM; |
| case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT; |
| case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT; |
| case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT; |
| case GL_R32F: return ISL_FORMAT_R32_FLOAT; |
| case GL_R32UI: return ISL_FORMAT_R32_UINT; |
| case GL_R32I: return ISL_FORMAT_R32_SINT; |
| case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT; |
| case GL_RG32UI: return ISL_FORMAT_R32G32_UINT; |
| case GL_RG32I: return ISL_FORMAT_R32G32_SINT; |
| case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT; |
| case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT; |
| case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT; |
| case GL_NONE: return ISL_FORMAT_UNSUPPORTED; |
| default: |
| assert(!"Invalid image format"); |
| return ISL_FORMAT_UNSUPPORTED; |
| } |
| } |
| |
| /** |
| * Simple 4-tuple of scalars used to pass around per-color component |
| * values. |
| */ |
| struct color_u { |
| color_u(unsigned x = 0) : r(x), g(x), b(x), a(x) |
| { |
| } |
| |
| color_u(unsigned r, unsigned g, unsigned b, unsigned a) : |
| r(r), g(g), b(b), a(a) |
| { |
| } |
| |
| unsigned |
| operator[](unsigned i) const |
| { |
| const unsigned xs[] = { r, g, b, a }; |
| return xs[i]; |
| } |
| |
| unsigned r, g, b, a; |
| }; |
| |
| /** |
| * Return the per-channel bitfield widths for a given image format. |
| */ |
| inline color_u |
| get_bit_widths(isl_format format) |
| { |
| const isl_format_layout *fmtl = isl_format_get_layout(format); |
| |
| return color_u(fmtl->channels.r.bits, |
| fmtl->channels.g.bits, |
| fmtl->channels.b.bits, |
| fmtl->channels.a.bits); |
| } |
| |
| /** |
| * Return the per-channel bitfield shifts for a given image format. |
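| * |
| * For example, ISL_FORMAT_R10G10B10A2_UNORM has channel widths of |
| * (10, 10, 10, 2), which yields shifts of (0, 10, 20, 30). |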
| */ |
| inline color_u |
| get_bit_shifts(isl_format format) |
| { |
| const color_u widths = get_bit_widths(format); |
| return color_u(0, widths.r, widths.r + widths.g, |
| widths.r + widths.g + widths.b); |
| } |
| |
| /** |
| * Return true if all present components have the same bit width. |
| */ |
| inline bool |
| is_homogeneous(isl_format format) |
| { |
| const color_u widths = get_bit_widths(format); |
| return ((widths.g == 0 || widths.g == widths.r) && |
| (widths.b == 0 || widths.b == widths.r) && |
| (widths.a == 0 || widths.a == widths.r)); |
| } |
| |
| /** |
| * Return true if the format conversion boils down to a trivial copy. |
| */ |
| inline bool |
| is_conversion_trivial(const gen_device_info *devinfo, isl_format format) |
| { |
| return (get_bit_widths(format).r == 32 && is_homogeneous(format)) || |
| format == isl_lower_storage_image_format(devinfo, format); |
| } |
| |
| /** |
| * Return true if the hardware natively supports some format with |
| * compatible bitfield layout, but possibly different data types. |
| */ |
| inline bool |
| has_supported_bit_layout(const gen_device_info *devinfo, |
| isl_format format) |
| { |
| const color_u widths = get_bit_widths(format); |
| const color_u lower_widths = get_bit_widths( |
| isl_lower_storage_image_format(devinfo, format)); |
| |
| return (widths.r == lower_widths.r && |
| widths.g == lower_widths.g && |
| widths.b == lower_widths.b && |
| widths.a == lower_widths.a); |
| } |
| |
| /** |
| * Return true if we are required to spread individual components over |
| * several components of the format used by the hardware (RG32 and |
| * friends implemented as RGBA16UI). |
| */ |
| inline bool |
| has_split_bit_layout(const gen_device_info *devinfo, isl_format format) |
| { |
| const isl_format lower_format = |
| isl_lower_storage_image_format(devinfo, format); |
| |
| return (isl_format_get_num_channels(format) < |
| isl_format_get_num_channels(lower_format)); |
| } |
| |
| /** |
| * Return true if the hardware returns garbage in the unused high bits |
| * of each component. This may happen on IVB because we rely on the |
| * undocumented behavior that typed reads from surfaces of the |
| * unsupported R8 and R16 formats return useful data in their least |
| * significant bits. |
| */ |
| inline bool |
| has_undefined_high_bits(const gen_device_info *devinfo, |
| isl_format format) |
| { |
| const isl_format lower_format = |
| isl_lower_storage_image_format(devinfo, format); |
| |
| return (devinfo->gen == 7 && !devinfo->is_haswell && |
| (lower_format == ISL_FORMAT_R16_UINT || |
| lower_format == ISL_FORMAT_R8_UINT)); |
| } |
| |
| /** |
| * Return true if the format represents values as signed integers |
| * requiring sign extension when unpacking. |
| */ |
| inline bool |
| needs_sign_extension(isl_format format) |
| { |
| return isl_format_has_snorm_channel(format) || |
| isl_format_has_sint_channel(format); |
| } |
| } |
| |
| namespace image_validity { |
| /** |
| * Check whether the bound image is suitable for untyped access. |
| */ |
| brw_predicate |
| emit_untyped_image_check(const fs_builder &bld, const fs_reg &image, |
| brw_predicate pred) |
| { |
| const gen_device_info *devinfo = bld.shader->devinfo; |
| const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); |
| |
| if (devinfo->gen == 7 && !devinfo->is_haswell) { |
| /* Check whether the first stride component (i.e. the Bpp value) |
| * is greater than four, which on Gen7 indicates that a surface of |
| * type RAW has been bound for untyped access. Reading or writing |
| * to a surface of type other than RAW using untyped surface |
| * messages causes a hang on IVB and VLV. |
| */ |
| set_predicate(pred, |
| bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4), |
| BRW_CONDITIONAL_G)); |
| |
| return BRW_PREDICATE_NORMAL; |
| } else { |
| /* More recent generations handle the format mismatch |
| * gracefully. |
| */ |
| return pred; |
| } |
| } |
| |
| /** |
| * Check whether there is an image bound at the given index and write |
| * the comparison result to f0.0. Returns an appropriate predication |
| * mode to use on subsequent image operations. |
| */ |
| brw_predicate |
| emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image) |
| { |
| const gen_device_info *devinfo = bld.shader->devinfo; |
| const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); |
| |
| if (devinfo->gen == 7 && !devinfo->is_haswell) { |
| /* Check the first component of the size field to find out if the |
| * image is bound. Necessary on IVB for typed atomics because |
| * they don't seem to respect null surfaces and will happily |
| * corrupt or read random memory when no image is bound. |
| */ |
| bld.CMP(bld.null_reg_ud(), |
| retype(size, BRW_REGISTER_TYPE_UD), |
| brw_imm_d(0), BRW_CONDITIONAL_NZ); |
| |
| return BRW_PREDICATE_NORMAL; |
| } else { |
| /* More recent platforms implement compliant behavior when a null |
| * surface is bound. |
| */ |
| return BRW_PREDICATE_NONE; |
| } |
| } |
| |
| /** |
| * Check whether the provided coordinates are within the image bounds |
| * and write the comparison result to f0.0. Returns an appropriate |
| * predication mode to use on subsequent image operations. |
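| * |
| * The per-dimension comparisons are chained together by predicating |
| * every CMP after the first on the previous result, so a channel |
| * only ends up with its flag bit set if all of its coordinates are |
| * within bounds. |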
| */ |
| brw_predicate |
| emit_bounds_check(const fs_builder &bld, const fs_reg &image, |
| const fs_reg &addr, unsigned dims) |
| { |
| const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); |
| |
| for (unsigned c = 0; c < dims; ++c) |
| set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL, |
| bld.CMP(bld.null_reg_ud(), |
| offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c), |
| offset(size, bld, c), |
| BRW_CONDITIONAL_L)); |
| |
| return BRW_PREDICATE_NORMAL; |
| } |
| } |
| |
| namespace image_coordinates { |
| /** |
| * Return the total number of coordinates needed to address a texel of |
| * the surface, which may be more than the sum of \p surf_dims and \p |
| * arr_dims if padding is required. |
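| * |
| * For example, a 1-D array image that has no matching typed storage |
| * format (and hence goes through the software coordinate handling) |
| * takes three coordinates: x, a zero padding component and the |
| * array index in the Z component. |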
| */ |
| unsigned |
| num_image_coordinates(const fs_builder &bld, |
| unsigned surf_dims, unsigned arr_dims, |
| isl_format format) |
| { |
| /* HSW in vec4 mode and our software coordinate handling for untyped |
| * reads want the array index to be at the Z component. |
| */ |
| const bool array_index_at_z = |
| format != ISL_FORMAT_UNSUPPORTED && |
| !isl_has_matching_typed_storage_image_format( |
| bld.shader->devinfo, format); |
| const unsigned zero_dims = |
| ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0); |
| |
| return surf_dims + zero_dims + arr_dims; |
| } |
| |
| /** |
| * Transform image coordinates into the form expected by the |
| * implementation. |
| */ |
| fs_reg |
| emit_image_coordinates(const fs_builder &bld, const fs_reg &addr, |
| unsigned surf_dims, unsigned arr_dims, |
| isl_format format) |
| { |
| const unsigned dims = |
| num_image_coordinates(bld, surf_dims, arr_dims, format); |
| |
| if (dims > surf_dims + arr_dims) { |
| assert(surf_dims == 1 && arr_dims == 1 && dims == 3); |
| /* The array index is required to be passed in as the Z component, |
| * so insert a zero at the Y component to shift it to the right |
| * position. |
| * |
| * FINISHME: Factor out this frequently recurring pattern into a |
| * helper function. |
| */ |
| const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) }; |
| const fs_reg dst = bld.vgrf(addr.type, dims); |
| bld.LOAD_PAYLOAD(dst, srcs, dims, 0); |
| return dst; |
| } else { |
| return addr; |
| } |
| } |
| |
| /** |
| * Calculate the offset in memory of the texel given by \p coord. |
| * |
| * This is meant to be used with untyped surface messages to access a |
| * tiled surface, which involves manually taking the tiling and |
| * swizzling modes of the surface into account, so it will hopefully |
| * not happen very often. |
| * |
| * The tiling algorithm implemented here matches either the X or Y |
| * tiling layouts supported by the hardware depending on the tiling |
| * coefficients passed to the program as uniforms. See Volume 1 Part 2 |
| * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth |
| * explanation of the hardware tiling format. |
| */ |
| fs_reg |
| emit_address_calculation(const fs_builder &bld, const fs_reg &image, |
| const fs_reg &coord, unsigned dims) |
| { |
| const gen_device_info *devinfo = bld.shader->devinfo; |
| const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET); |
| const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); |
| const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET); |
| const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET); |
| const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); |
| const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); |
| const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); |
| const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); |
| const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); |
| |
| /* Shift the coordinates by the fixed surface offset. It may be |
| * non-zero if the image is a single slice of a higher-dimensional |
| * surface, or if a non-zero mipmap level of the surface is bound to |
| * the pipeline. The offset needs to be applied here rather than at |
| * surface state set-up time because the desired slice-level may |
| * start mid-tile, so simply shifting the surface base address |
| * wouldn't give a well-formed tiled surface in the general case. |
| */ |
| for (unsigned c = 0; c < 2; ++c) |
| bld.ADD(offset(addr, bld, c), offset(off, bld, c), |
| (c < dims ? |
| offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) : |
| fs_reg(brw_imm_d(0)))); |
| |
| /* The layout of 3-D textures in memory is sort-of like a tiling |
| * format. At each miplevel, the slices are arranged in rows with |
| * 2^level slices per row. The slice row is stored in tmp.y and |
| * the slice within the row is stored in tmp.x. |
| * |
| * The layout of 2-D array textures and cubemaps is much simpler: |
| * Depending on whether the ARYSPC_LOD0 layout is in use it will be |
| * stored in memory as an array of slices, each one being a 2-D |
| * arrangement of miplevels, or as a 2D arrangement of miplevels, |
| * each one being an array of slices. In either case the separation |
| * between slices of the same LOD is equal to the qpitch value |
| * provided as stride.w. |
| * |
| * This code can be made to handle both 2-D array textures and 3-D |
| * textures by passing in the miplevel as tile.z for 3-D textures |
| * and 0 in tile.z for 2-D array textures. |
| * |
| * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface |
| * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion |
| * of the hardware 3D texture and 2D array layouts. |
| */ |
| if (dims > 2) { |
| /* Decompose z into a major (tmp.y) and a minor (tmp.x) |
| * index. |
| */ |
| bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0), |
| offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2)); |
| bld.SHR(offset(tmp, bld, 1), |
| offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2), |
| offset(tile, bld, 2)); |
| |
| /* Take into account the horizontal (tmp.x) and vertical (tmp.y) |
| * slice offset. |
| */ |
| for (unsigned c = 0; c < 2; ++c) { |
| bld.MUL(offset(tmp, bld, c), |
| offset(stride, bld, 2 + c), offset(tmp, bld, c)); |
| bld.ADD(offset(addr, bld, c), |
| offset(addr, bld, c), offset(tmp, bld, c)); |
| } |
| } |
| |
| if (dims > 1) { |
| /* Calculate the major/minor x and y indices. In order to |
| * accommodate both X and Y tiling, the Y-major tiling format is |
| * treated as being a bunch of narrow X-tiles placed next to each |
| * other. This means that the tile width for Y-tiling is actually |
| * the width of one sub-column of the Y-major tile where each 4K |
| * tile has 8 512B sub-columns. |
| * |
| * The major Y value is the row of tiles in which the pixel lives. |
| * The major X value is the tile sub-column in which the pixel |
| * lives; for X tiling this is the same as the tile column, while |
| * for Y tiling each tile has 8 sub-columns. The minor X and Y indices |
| * are the position within the sub-column. |
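| * |
| * Presumably tile.x and tile.y hold the base-2 logarithms of the |
| * sub-column width and height, so the BFE/SHR pair below splits |
| * each coordinate into an intra-tile (minor) offset and a tile |
| * (major) index. |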
| */ |
| for (unsigned c = 0; c < 2; ++c) { |
| /* Calculate the minor x and y indices. */ |
| bld.BFE(offset(minor, bld, c), offset(tile, bld, c), |
| brw_imm_d(0), offset(addr, bld, c)); |
| |
| /* Calculate the major x and y indices. */ |
| bld.SHR(offset(major, bld, c), |
| offset(addr, bld, c), offset(tile, bld, c)); |
| } |
| |
| /* Calculate the texel index from the start of the tile row and |
| * the vertical coordinate of the row. |
| * Equivalent to: |
| * tmp.x = (major.x << tile.y << tile.x) + |
| * (minor.y << tile.x) + minor.x |
| * tmp.y = major.y << tile.y |
| */ |
| bld.SHL(tmp, major, offset(tile, bld, 1)); |
| bld.ADD(tmp, tmp, offset(minor, bld, 1)); |
| bld.SHL(tmp, tmp, offset(tile, bld, 0)); |
| bld.ADD(tmp, tmp, minor); |
| bld.SHL(offset(tmp, bld, 1), |
| offset(major, bld, 1), offset(tile, bld, 1)); |
| |
| /* Add it to the start of the tile row. */ |
| bld.MUL(offset(tmp, bld, 1), |
| offset(tmp, bld, 1), offset(stride, bld, 1)); |
| bld.ADD(tmp, tmp, offset(tmp, bld, 1)); |
| |
| /* Multiply by the Bpp value. */ |
| bld.MUL(dst, tmp, stride); |
| |
| if (devinfo->gen < 8 && !devinfo->is_baytrail) { |
| /* Take into account the two dynamically specified shifts. Both |
| * are needed to implement swizzling of X-tiled surfaces. For |
| * Y-tiled surfaces only one bit needs to be XOR-ed with bit 6 of |
| * the memory address, so a swz value of 0xff (actually interpreted |
| * as 31 by the hardware) will be provided to cause the relevant |
| * bit of tmp.y to be zero and turn the first XOR into the |
| * identity. For linear surfaces or platforms lacking address |
| * swizzling both shifts will be 0xff, causing the relevant bits of |
| * both tmp.x and .y to be zero, which effectively disables |
| * swizzling. |
| */ |
| for (unsigned c = 0; c < 2; ++c) |
| bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c)); |
| |
| /* XOR tmp.x and tmp.y with bit 6 of the memory address. */ |
| bld.XOR(tmp, tmp, offset(tmp, bld, 1)); |
| bld.AND(tmp, tmp, brw_imm_d(1 << 6)); |
| bld.XOR(dst, dst, tmp); |
| } |
| |
| } else { |
| /* Multiply by the Bpp/stride value. Note that addr.y may be |
| * non-zero even if the image is one-dimensional because a |
| * vertical offset may have been applied above to select a |
| * non-zero slice or level of a higher-dimensional texture. |
| */ |
| bld.MUL(offset(addr, bld, 1), |
| offset(addr, bld, 1), offset(stride, bld, 1)); |
| bld.ADD(addr, addr, offset(addr, bld, 1)); |
| bld.MUL(dst, addr, stride); |
| } |
| |
| return dst; |
| } |
| } |
| |
| namespace image_format_conversion { |
| using image_format_info::color_u; |
| |
| namespace { |
| /** |
| * Maximum representable value in an unsigned integer with the given |
| * number of bits. |
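| * |
| * E.g. scale(8) == 255 and scale(1) == 1. |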
| */ |
| inline unsigned |
| scale(unsigned n) |
| { |
| return (1 << n) - 1; |
| } |
| } |
| |
| /** |
| * Pack the vector \p src into a bitfield given the per-component bit |
| * shifts and widths. Note that bitfield components are not allowed to |
| * cross 32-bit boundaries. |
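| * |
| * E.g. with shifts (0, 8, 16, 24) and widths (8, 8, 8, 8) all four |
| * components end up OR-ed together into the first dword of the |
| * result. |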
| */ |
| fs_reg |
| emit_pack(const fs_builder &bld, const fs_reg &src, |
| const color_u &shifts, const color_u &widths) |
| { |
| const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); |
| bool seen[4] = {}; |
| |
| for (unsigned c = 0; c < 4; ++c) { |
| if (widths[c]) { |
| const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); |
| |
| /* Shift each component left to the correct bitfield position. */ |
| bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32)); |
| |
| /* Add everything up. */ |
| if (seen[shifts[c] / 32]) { |
| bld.OR(offset(dst, bld, shifts[c] / 32), |
| offset(dst, bld, shifts[c] / 32), tmp); |
| } else { |
| bld.MOV(offset(dst, bld, shifts[c] / 32), tmp); |
| seen[shifts[c] / 32] = true; |
| } |
| } |
| } |
| |
| return dst; |
| } |
| |
| /** |
| * Unpack a vector from the bitfield \p src given the per-component bit |
| * shifts and widths. Note that bitfield components are not allowed to |
| * cross 32-bit boundaries. |
| */ |
| fs_reg |
| emit_unpack(const fs_builder &bld, const fs_reg &src, |
| const color_u &shifts, const color_u &widths) |
| { |
| const fs_reg dst = bld.vgrf(src.type, 4); |
| |
| for (unsigned c = 0; c < 4; ++c) { |
| if (widths[c]) { |
| /* Shift left to discard the most significant bits. */ |
| bld.SHL(offset(dst, bld, c), |
| offset(src, bld, shifts[c] / 32), |
| brw_imm_ud(32 - shifts[c] % 32 - widths[c])); |
| |
| /* Shift back to the least significant bits using an arithmetic |
| * shift to get sign extension on signed types. |
| */ |
| bld.ASR(offset(dst, bld, c), |
| offset(dst, bld, c), brw_imm_ud(32 - widths[c])); |
| } |
| } |
| |
| return dst; |
| } |
| |
| /** |
| * Convert an integer vector into another integer vector of the |
| * specified bit widths, properly handling overflow. |
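| * |
| * E.g. a signed 8-bit destination channel is clamped to the range |
| * [-128, 127] and then masked down to its low 8 bits. |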
| */ |
| fs_reg |
| emit_convert_to_integer(const fs_builder &bld, const fs_reg &src, |
| const color_u &widths, bool is_signed) |
| { |
| const unsigned s = (is_signed ? 1 : 0); |
| const fs_reg dst = bld.vgrf( |
| is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); |
| assert(src.type == dst.type); |
| |
| for (unsigned c = 0; c < 4; ++c) { |
| if (widths[c]) { |
| /* Clamp to the maximum value. */ |
| bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c), |
| brw_imm_d((int)scale(widths[c] - s)), |
| BRW_CONDITIONAL_L); |
| |
| /* Clamp to the minimum value. */ |
| if (is_signed) |
| bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c), |
| brw_imm_d(-(int)scale(widths[c] - s) - 1), |
| BRW_CONDITIONAL_GE); |
| |
| /* Mask off all but the bits we actually want. Otherwise, if |
| * we pass a negative number into the hardware when it's |
| * expecting something like UINT8, it will happily clamp it to |
| * +255 for us. |
| */ |
| if (is_signed && widths[c] < 32) |
| bld.AND(offset(dst, bld, c), offset(dst, bld, c), |
| brw_imm_d(scale(widths[c]))); |
| } |
| } |
| |
| return dst; |
| } |
| |
| /** |
| * Convert a normalized fixed-point vector of the specified signedness |
| * and bit widths into a floating point vector. |
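| * |
| * E.g. an 8-bit SNORM channel is divided by scale(7) = 127 and an |
| * 8-bit UNORM channel by scale(8) = 255. |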
| */ |
| fs_reg |
| emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src, |
| const color_u &widths, bool is_signed) |
| { |
| const unsigned s = (is_signed ? 1 : 0); |
| const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4); |
| |
| for (unsigned c = 0; c < 4; ++c) { |
| if (widths[c]) { |
| /* Convert to float. */ |
| bld.MOV(offset(dst, bld, c), offset(src, bld, c)); |
| |
| /* Divide by the normalization constants. */ |
| bld.MUL(offset(dst, bld, c), offset(dst, bld, c), |
| brw_imm_f(1.0f / scale(widths[c] - s))); |
| |
| /* Clamp to the minimum value. */ |
| if (is_signed) |
| bld.emit_minmax(offset(dst, bld, c), |
| offset(dst, bld, c), brw_imm_f(-1.0f), |
| BRW_CONDITIONAL_GE); |
| } |
| } |
| return dst; |
| } |
| |
| /** |
| * Convert a floating-point vector into a normalized fixed-point vector |
| * of the specified signedness and bit widths. |
| */ |
| fs_reg |
| emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src, |
| const color_u &widths, bool is_signed) |
| { |
| const unsigned s = (is_signed ? 1 : 0); |
| const fs_reg dst = bld.vgrf( |
| is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); |
| const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); |
| |
| for (unsigned c = 0; c < 4; ++c) { |
| if (widths[c]) { |
| /* Clamp the normalized floating-point argument. */ |
| if (is_signed) { |
| bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c), |
| brw_imm_f(-1.0f), BRW_CONDITIONAL_GE); |
| |
| bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), |
| brw_imm_f(1.0f), BRW_CONDITIONAL_L); |
| } else { |
| set_saturate(true, bld.MOV(offset(fdst, bld, c), |
| offset(src, bld, c))); |
| } |
| |
| /* Multiply by the normalization constants. */ |
| bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c), |
| brw_imm_f((float)scale(widths[c] - s))); |
| |
| /* Convert to integer. */ |
| bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c)); |
| bld.MOV(offset(dst, bld, c), offset(fdst, bld, c)); |
| |
| /* Mask off all but the bits we actually want. Otherwise, if |
| * we pass a negative number into the hardware when it's |
| * expecting something like UINT8, it will happily clamp it to |
| * +255 for us. |
| */ |
| if (is_signed && widths[c] < 32) |
| bld.AND(offset(dst, bld, c), offset(dst, bld, c), |
| brw_imm_d(scale(widths[c]))); |
| } |
| } |
| |
| return dst; |
| } |
| |
| /** |
| * Convert a floating point vector of the specified bit widths into a |
| * 32-bit floating point vector. |
| */ |
| fs_reg |
| emit_convert_from_float(const fs_builder &bld, const fs_reg &src, |
| const color_u &widths) |
| { |
| const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); |
| const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); |
| |
| for (unsigned c = 0; c < 4; ++c) { |
| if (widths[c]) { |
| bld.MOV(offset(dst, bld, c), offset(src, bld, c)); |
| |
| /* Extend 10-bit and 11-bit floating point numbers to 15 bits. |
| * This works because they have a 5-bit exponent just like the |
| * 16-bit floating point format, and they have no sign bit. |
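| * |
| * E.g. an 11-bit float (5-bit exponent, 6-bit mantissa) is shifted |
| * left by 4 bits so its exponent lines up with the half-float |
| * exponent field, leaving the sign bit zero. |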
| */ |
| if (widths[c] < 16) |
| bld.SHL(offset(dst, bld, c), |
| offset(dst, bld, c), brw_imm_ud(15 - widths[c])); |
| |
| /* Convert to 32-bit floating point. */ |
| bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c)); |
| } |
| } |
| |
| return fdst; |
| } |
| |
| /** |
| * Convert a vector into a floating point vector of the specified bit |
| * widths. |
| */ |
| fs_reg |
| emit_convert_to_float(const fs_builder &bld, const fs_reg &src, |
| const color_u &widths) |
| { |
| const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); |
| const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); |
| |
| for (unsigned c = 0; c < 4; ++c) { |
| if (widths[c]) { |
| bld.MOV(offset(fdst, bld, c), offset(src, bld, c)); |
| |
| /* Clamp negative values to zero, since the 10-bit and 11-bit |
| * float formats have no sign bit. |
| */ |
| if (widths[c] < 16) |
| bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), |
| brw_imm_f(0.0f), BRW_CONDITIONAL_GE); |
| |
| /* Convert to 16-bit floating-point. */ |
| bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c)); |
| |
| /* Discard the least significant bits to get floating point |
| * numbers of the requested width. This works because the |
| * 10-bit and 11-bit floating point formats have a 5-bit |
| * exponent just like the 16-bit format, and they have no sign |
| * bit. |
| */ |
| if (widths[c] < 16) |
| bld.SHR(offset(dst, bld, c), offset(dst, bld, c), |
| brw_imm_ud(15 - widths[c])); |
| } |
| } |
| |
| return dst; |
| } |
| |
| /** |
| * Fill missing components of a vector with 0, 0, 0, 1. |
| */ |
| fs_reg |
| emit_pad(const fs_builder &bld, const fs_reg &src, |
| const color_u &widths) |
| { |
| const fs_reg dst = bld.vgrf(src.type, 4); |
| const unsigned pad[] = { 0, 0, 0, 1 }; |
| |
| for (unsigned c = 0; c < 4; ++c) |
| bld.MOV(offset(dst, bld, c), |
| widths[c] ? offset(src, bld, c) |
| : fs_reg(brw_imm_ud(pad[c]))); |
| |
| return dst; |
| } |
| } |
| } |
| |
| namespace brw { |
| namespace image_access { |
| /** |
| * Load a vector from a surface of the given format and dimensionality |
| * at the given coordinates. \p surf_dims and \p arr_dims give the |
| * number of non-array and array coordinates of the image respectively. |
| */ |
| fs_reg |
| emit_image_load(const fs_builder &bld, |
| const fs_reg &image, const fs_reg &addr, |
| unsigned surf_dims, unsigned arr_dims, |
| unsigned gl_format) |
| { |
| using namespace image_format_info; |
| using namespace image_format_conversion; |
| using namespace image_validity; |
| using namespace image_coordinates; |
| using namespace surface_access; |
| const gen_device_info *devinfo = bld.shader->devinfo; |
| const isl_format format = isl_format_for_gl_format(gl_format); |
| const isl_format lower_format = |
| isl_lower_storage_image_format(devinfo, format); |
| fs_reg tmp; |
| |
| /* Transform the image coordinates into actual surface coordinates. */ |
| const fs_reg saddr = |
| emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); |
| const unsigned dims = |
| num_image_coordinates(bld, surf_dims, arr_dims, format); |
| |
| if (isl_has_matching_typed_storage_image_format(devinfo, format)) { |
| /* Hopefully we get here most of the time... */ |
| tmp = emit_typed_read(bld, image, saddr, dims, |
| isl_format_get_num_channels(lower_format)); |
| } else { |
| /* Untyped surface reads return 32 bits of the surface per |
| * component, without any sort of unpacking or type conversion, |
| */ |
| const unsigned size = isl_format_get_layout(format)->bpb / 32; |
| /* they don't properly handle out of bounds access, so we have to |
| * check manually if the coordinates are valid and predicate the |
| * surface read on the result, |
| */ |
| const brw_predicate pred = |
| emit_untyped_image_check(bld, image, |
| emit_bounds_check(bld, image, |
| saddr, dims)); |
| |
| /* and they don't know about surface coordinates, so we need to |
| * convert them to a raw memory offset. |
| */ |
| const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims); |
| |
| tmp = emit_untyped_read(bld, image, laddr, 1, size, pred); |
| |
| /* An out-of-bounds surface access should return zero. */ |
| for (unsigned c = 0; c < size; ++c) |
| set_predicate(pred, bld.SEL(offset(tmp, bld, c), |
| offset(tmp, bld, c), brw_imm_d(0))); |
| } |
| |
| /* Set the register type to D instead of UD if the data type is |
| * represented as a signed integer in memory so that sign extension |
| * is handled correctly by unpack. |
| */ |
| if (needs_sign_extension(format)) |
| tmp = retype(tmp, BRW_REGISTER_TYPE_D); |
| |
| if (!has_supported_bit_layout(devinfo, format)) { |
| /* Unpack individual vector components from the bitfield if the |
| * hardware is unable to do it for us. |
| */ |
| if (has_split_bit_layout(devinfo, format)) |
| tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format), |
| get_bit_widths(lower_format)); |
| else |
| tmp = emit_unpack(bld, tmp, get_bit_shifts(format), |
| get_bit_widths(format)); |
| |
| } else if ((needs_sign_extension(format) && |
| !is_conversion_trivial(devinfo, format)) || |
| has_undefined_high_bits(devinfo, format)) { |
| /* Perform a trivial unpack even though the bit layout matches in |
| * order to get the most significant bits of each component |
| * initialized properly. |
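| * |
| * With shifts of (0, 32, 64, 96) each component maps onto its own |
| * dword, so the unpack reduces to the SHL/ASR pair that |
| * sign-extends or zeroes out the unused high bits. |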
| */ |
| tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96), |
| get_bit_widths(format)); |
| } |
| |
| if (!isl_format_has_int_channel(format)) { |
| if (is_conversion_trivial(devinfo, format)) { |
| /* Just need to cast the vector to the target type. */ |
| tmp = retype(tmp, BRW_REGISTER_TYPE_F); |
| } else { |
| /* Do the right sort of type conversion to float. */ |
| if (isl_format_has_float_channel(format)) |
| tmp = emit_convert_from_float( |
| bld, tmp, get_bit_widths(format)); |
| else |
| tmp = emit_convert_from_scaled( |
| bld, tmp, get_bit_widths(format), |
| isl_format_has_snorm_channel(format)); |
| } |
| } |
| |
| /* Initialize missing components of the result. */ |
| return emit_pad(bld, tmp, get_bit_widths(format)); |
| } |
| |
| /** |
| * Store a vector in a surface of the given format and dimensionality at |
| * the given coordinates. \p surf_dims and \p arr_dims give the number |
| * of non-array and array coordinates of the image respectively. |
| */ |
| void |
| emit_image_store(const fs_builder &bld, const fs_reg &image, |
| const fs_reg &addr, const fs_reg &src, |
| unsigned surf_dims, unsigned arr_dims, |
| unsigned gl_format) |
| { |
| using namespace image_format_info; |
| using namespace image_format_conversion; |
| using namespace image_validity; |
| using namespace image_coordinates; |
| using namespace surface_access; |
| const isl_format format = isl_format_for_gl_format(gl_format); |
| const gen_device_info *devinfo = bld.shader->devinfo; |
| |
| /* Transform the image coordinates into actual surface coordinates. */ |
| const fs_reg saddr = |
| emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); |
| const unsigned dims = |
| num_image_coordinates(bld, surf_dims, arr_dims, format); |
| |
| if (gl_format == GL_NONE) { |
| /* We don't know what the format is, but that's fine because it |
| * implies write-only access, and typed surface writes are always |
| * able to take care of type conversion and packing for us. |
| */ |
| emit_typed_write(bld, image, saddr, src, dims, 4); |
| |
| } else { |
| const isl_format lower_format = |
| isl_lower_storage_image_format(devinfo, format); |
| fs_reg tmp = src; |
| |
| if (!is_conversion_trivial(devinfo, format)) { |
| /* Do the right sort of type conversion. */ |
| if (isl_format_has_float_channel(format)) |
| tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format)); |
| |
| else if (isl_format_has_int_channel(format)) |
| tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format), |
| isl_format_has_sint_channel(format)); |
| |
| else |
| tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format), |
| isl_format_has_snorm_channel(format)); |
| } |
| |
| /* We're down to bit manipulation at this point. */ |
| tmp = retype(tmp, BRW_REGISTER_TYPE_UD); |
| |
| if (!has_supported_bit_layout(devinfo, format)) { |
| /* Pack the vector components into a bitfield if the hardware |
| * is unable to do it for us. |
| */ |
| if (has_split_bit_layout(devinfo, format)) |
| tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format), |
| get_bit_widths(lower_format)); |
| |
| else |
| tmp = emit_pack(bld, tmp, get_bit_shifts(format), |
| get_bit_widths(format)); |
| } |
| |
| if (isl_has_matching_typed_storage_image_format(devinfo, format)) { |
| /* Hopefully we get here most of the time... */ |
| emit_typed_write(bld, image, saddr, tmp, dims, |
| isl_format_get_num_channels(lower_format)); |
| |
| } else { |
| /* Untyped surface writes store 32 bits of the surface per |
| * component, without any sort of packing or type conversion, |
| */ |
| const unsigned size = isl_format_get_layout(format)->bpb / 32; |
| |
| /* they don't properly handle out of bounds access, so we have |
| * to check manually if the coordinates are valid and predicate |
| * the surface write on the result, |
| */ |
| const brw_predicate pred = |
| emit_untyped_image_check(bld, image, |
| emit_bounds_check(bld, image, |
| saddr, dims)); |
| |
| /* and, phew, they don't know about surface coordinates, so we |
| * need to convert them to a raw memory offset. |
| */ |
| const fs_reg laddr = emit_address_calculation( |
| bld, image, saddr, dims); |
| |
| emit_untyped_write(bld, image, laddr, tmp, 1, size, pred); |
| } |
| } |
| } |
| |
| /** |
| * Perform an atomic read-modify-write operation in a surface of the |
| * given dimensionality at the given coordinates. \p surf_dims and \p |
| * arr_dims give the number of non-array and array coordinates of the |
| * image respectively. Main building block of the imageAtomic GLSL |
| * built-ins. |
| */ |
| fs_reg |
| emit_image_atomic(const fs_builder &bld, |
| const fs_reg &image, const fs_reg &addr, |
| const fs_reg &src0, const fs_reg &src1, |
| unsigned surf_dims, unsigned arr_dims, |
| unsigned rsize, unsigned op) |
| { |
| using namespace image_validity; |
| using namespace image_coordinates; |
| using namespace surface_access; |
| /* Avoid performing an atomic operation on an unbound surface. */ |
| const brw_predicate pred = emit_typed_atomic_check(bld, image); |
| |
| /* Transform the image coordinates into actual surface coordinates. */ |
| const fs_reg saddr = |
| emit_image_coordinates(bld, addr, surf_dims, arr_dims, |
| ISL_FORMAT_R32_UINT); |
| const unsigned dims = |
| num_image_coordinates(bld, surf_dims, arr_dims, |
| ISL_FORMAT_R32_UINT); |
| |
| /* Thankfully we can do without untyped atomics here. */ |
| const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1, |
| dims, rsize, op, pred); |
| |
| /* An unbound surface access should return zero. */ |
| if (rsize && pred) |
| set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0))); |
| |
| return retype(tmp, src0.type); |
| } |
| } |
| } |