/*
* Copyright 2019-2020 Valve Corporation
* SPDX-License-Identifier: MIT
*
* Authors:
* Jonathan Marek <jonathan@marek.ca>
*/
#include "tu_private.h"
#include "tu_cs.h"
#include "vk_format.h"
#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}
/* r2d_ = BLIT_OP_SCALE operations */
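/* Pick the 2D engine "ifmt" from the size of the red channel. A few
 * examples (illustrative): R8G8B8A8_UNORM -> R2D_UNORM8,
 * R16G16_UINT -> R2D_INT16, R16_SFLOAT -> R2D_FLOAT16, while 16-bit
 * non-float formats such as R16_UNORM fall back to R2D_FLOAT32.
 */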
static enum a6xx_2d_ifmt
format_to_ifmt(VkFormat format)
{
if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
format == VK_FORMAT_X8_D24_UNORM_PACK32)
return R2D_UNORM8;
/* get_component_bits doesn't work with depth/stencil formats: */
if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
return R2D_FLOAT32;
if (format == VK_FORMAT_S8_UINT)
return R2D_INT8;
/* use the size of the red channel to find the corresponding "ifmt" */
bool is_int = vk_format_is_int(format);
switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
case 4: case 5: case 8:
return is_int ? R2D_INT8 : R2D_UNORM8;
case 10: case 11:
return is_int ? R2D_INT16 : R2D_FLOAT16;
case 16:
if (vk_format_is_float(format))
return R2D_FLOAT16;
return is_int ? R2D_INT16 : R2D_FLOAT32;
case 32:
return is_int ? R2D_INT32 : R2D_FLOAT32;
default:
unreachable("bad format");
return 0;
}
}
static void
r2d_coords(struct tu_cs *cs,
const VkOffset2D *dst,
const VkOffset2D *src,
const VkExtent2D *extent)
{
tu_cs_emit_regs(cs,
A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));
if (!src)
return;
tu_cs_emit_regs(cs,
A6XX_GRAS_2D_SRC_TL_X(src->x),
A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
A6XX_GRAS_2D_SRC_TL_Y(src->y),
A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
}
static void
r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
uint32_t clear_value[4] = {};
switch (format) {
case VK_FORMAT_X8_D24_UNORM_PACK32:
case VK_FORMAT_D24_UNORM_S8_UINT:
/* cleared as r8g8b8a8_unorm using special format */
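/* e.g. depth = 0.5f packs to 0x800000; the 2D engine presumably consumes
 * only the low byte of each channel, so r/g/b below carry bits 7:0, 15:8
 * and 23:16 of the 24-bit depth value respectively.
 */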
clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
clear_value[1] = clear_value[0] >> 8;
clear_value[2] = clear_value[0] >> 16;
clear_value[3] = val->depthStencil.stencil;
break;
case VK_FORMAT_D16_UNORM:
case VK_FORMAT_D32_SFLOAT:
/* R2D_FLOAT32 */
clear_value[0] = fui(val->depthStencil.depth);
break;
case VK_FORMAT_S8_UINT:
clear_value[0] = val->depthStencil.stencil;
break;
case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
/* cleared as UINT32 */
clear_value[0] = float3_to_rgb9e5(val->color.float32);
break;
default:
assert(!vk_format_is_depth_or_stencil(format));
const struct util_format_description *desc = vk_format_description(format);
enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));
for (unsigned i = 0; i < desc->nr_channels; i++) {
const struct util_format_channel_description *ch = &desc->channel[i];
if (ifmt == R2D_UNORM8) {
float linear = val->color.float32[i];
if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
linear = util_format_linear_to_srgb_float(val->color.float32[i]);
if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
else
clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
} else if (ifmt == R2D_FLOAT16) {
clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
} else {
assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
ifmt == R2D_INT16 || ifmt == R2D_INT8);
clear_value[i] = val->color.uint32[i];
}
}
break;
}
tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
tu_cs_emit_array(cs, clear_value, 4);
}
static void
r2d_src(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *iview,
uint32_t layer,
VkFilter filter)
{
uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
if (filter != VK_FILTER_NEAREST)
src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
tu_cs_emit(cs, src_info);
tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
tu_cs_image_ref_2d(cs, iview, layer, true);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
tu_cs_image_flag_ref(cs, iview, layer);
}
static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat vk_format,
uint64_t va, uint32_t pitch,
uint32_t width, uint32_t height)
{
struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
tu_cs_emit_regs(cs,
A6XX_SP_PS_2D_SRC_INFO(
.color_format = format.fmt,
.color_swap = format.swap,
.srgb = vk_format_is_srgb(vk_format),
.unk20 = 1,
.unk22 = 1),
A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
A6XX_SP_PS_2D_SRC_HI(va >> 32),
A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
}
static void
r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
assert(iview->image->samples == 1);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
tu_cs_emit(cs, iview->RB_2D_DST_INFO);
tu_cs_image_ref_2d(cs, iview, layer, false);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
tu_cs_image_flag_ref(cs, iview, layer);
}
static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
assert(iview->image->samples == 1);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
tu_cs_emit(cs, iview->stencil_PITCH);
}
static void
r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
tu_cs_emit_regs(cs,
A6XX_RB_2D_DST_INFO(
.color_format = format.fmt,
.color_swap = format.swap,
.srgb = vk_format_is_srgb(vk_format)),
A6XX_RB_2D_DST_LO((uint32_t) va),
A6XX_RB_2D_DST_HI(va >> 32),
A6XX_RB_2D_DST_PITCH(pitch));
}
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat vk_format,
VkImageAspectFlags aspect_mask,
enum a6xx_rotation rotation,
bool clear,
bool ubwc,
bool scissor)
{
enum a6xx_format format = tu6_base_format(vk_format);
enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
uint32_t unknown_8c01 = 0;
if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
}
/* note: the only format with partial clearing is D24S8 */
if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
/* preserve stencil channel */
if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
unknown_8c01 = 0x08000041;
/* preserve depth channels */
if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
unknown_8c01 = 0x00084001;
}
tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
tu_cs_emit(cs, unknown_8c01);
uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
.scissor = scissor,
.rotate = rotation,
.solid_color = clear,
.d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
.color_format = format,
.mask = 0xf,
.ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
).value;
tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
tu_cs_emit(cs, blit_cntl);
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
tu_cs_emit(cs, blit_cntl);
if (format == FMT6_10_10_10_2_UNORM_DEST)
format = FMT6_16_16_16_16_FLOAT;
tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
.sint = vk_format_is_sint(vk_format),
.uint = vk_format_is_uint(vk_format),
.color_format = format,
.srgb = vk_format_is_srgb(vk_format),
.mask = 0xf));
}
static void
r2d_setup(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat vk_format,
VkImageAspectFlags aspect_mask,
enum a6xx_rotation rotation,
bool clear,
bool ubwc)
{
tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
}
static void
r2d_teardown(struct tu_cmd_buffer *cmd,
struct tu_cs *cs)
{
/* nothing to do here */
}
static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
tu_cs_emit_pkt7(cs, CP_BLIT, 1);
tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
}
/* r3d_ = shader path operations */
void
tu_init_clear_blit_shaders(struct tu6_global *global)
{
#define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }
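/* These macros hand-assemble ir3 instruction words, which is enough for
 * the tiny clear/blit shaders below without going through the compiler.
 */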
static const instr_t vs_code[] = {
/* r0.xyz = r0.w ? c1.xyz : c0.xyz
* r1.xy = r0.w ? c1.zw : c0.zw
* r0.w = 1.0f
*/
CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
.c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
.src2 = 3,
.c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
.c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
.src2 = 3,
.c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
{ .cat0 = { .opc = OPC_END } },
};
static const instr_t fs_blit[] = {
/* " bary.f (ei)r63.x, 0, r0.x" note the blob doesn't have this in its
* blit path (its not clear what allows it to not have it)
*/
CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
{ .cat0 = { .opc = OPC_END } },
};
memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));
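/* One clear FS variant per MRT count: variant N copies N vec4 clear
 * colors from consts c0..c(N-1) into r0..r(N-1).
 */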
for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
for (uint32_t i = 0; i < num_rts; i++) {
/* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
*code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
}
*code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
}
}
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
bool layered_clear)
{
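/* Hand-rolled shader variant descriptions matching the static ir3 code
 * from tu_init_clear_blit_shaders; no actual shader compile happens here.
 */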
struct ir3_const_state dummy_const_state = {};
struct ir3_shader dummy_shader = {};
struct ir3_shader_variant vs = {
.type = MESA_SHADER_VERTEX,
.instrlen = 1,
.constlen = 4,
.info.max_reg = 1,
.inputs_count = 1,
.inputs[0] = {
.slot = SYSTEM_VALUE_VERTEX_ID,
.regid = regid(0, 3),
.sysval = true,
},
.outputs_count = blit ? 2 : 1,
.outputs[0] = {
.slot = VARYING_SLOT_POS,
.regid = regid(0, 0),
},
.outputs[1] = {
.slot = VARYING_SLOT_VAR0,
.regid = regid(1, 0),
},
.shader = &dummy_shader,
.const_state = &dummy_const_state,
};
if (layered_clear) {
vs.outputs[1].slot = VARYING_SLOT_LAYER;
vs.outputs[1].regid = regid(1, 1);
vs.outputs_count = 2;
}
struct ir3_shader_variant fs = {
.type = MESA_SHADER_FRAGMENT,
.instrlen = 1, /* max of 9 instructions with num_rts = 8 */
.constlen = align(num_rts, 4),
.info.max_reg = MAX2(num_rts, 1) - 1,
.total_in = blit ? 2 : 0,
.num_samp = blit ? 1 : 0,
.inputs_count = blit ? 2 : 0,
.inputs[0] = {
.slot = VARYING_SLOT_VAR0,
.inloc = 0,
.compmask = 3,
.bary = true,
},
.inputs[1] = {
.slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
.regid = regid(0, 0),
.sysval = 1,
},
.num_sampler_prefetch = blit ? 1 : 0,
.sampler_prefetch[0] = {
.src = 0,
.wrmask = 0xf,
.cmd = 4,
},
.shader = &dummy_shader,
.const_state = &dummy_const_state,
};
tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
.vs_state = true,
.hs_state = true,
.ds_state = true,
.gs_state = true,
.fs_state = true,
.cs_state = true,
.gfx_ibo = true,
.cs_ibo = true,
.gfx_shared_const = true,
.gfx_bindless = 0x1f,
.cs_bindless = 0x1f));
tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));
tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());
/* Copy what the blob does here. This will emit an extra 0x3f
* CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
* this is working around yet.
*/
tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
tu_cs_emit(cs, 0);
tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());
tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);
/* REPL_MODE for varying with RECTLIST (2 vertices only) */
tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
tu6_emit_fs_inputs(cs, &fs);
tu_cs_emit_regs(cs,
A6XX_GRAS_CL_CNTL(
.persp_division_disable = 1,
.vp_xform_disable = 1,
.vp_clip_code_ignore = 1,
.clip_disable = 1));
tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
tu_cs_emit_regs(cs,
A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
tu_cs_emit_regs(cs,
A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
tu_cs_emit_regs(cs,
A6XX_VFD_INDEX_OFFSET(),
A6XX_VFD_INSTANCE_START_OFFSET());
}
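/* The 8 floats are loaded as VS consts c0/c1: c0 holds the first dst.xy /
 * src.xy corner and c1 the second, and the VS selects between them per
 * vertex id to emit a two-vertex RECTLIST.
 */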
static void
r3d_coords_raw(struct tu_cs *cs, const float *coords)
{
tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
CP_LOAD_STATE6_0_NUM_UNIT(2));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
}
static void
r3d_coords(struct tu_cs *cs,
const VkOffset2D *dst,
const VkOffset2D *src,
const VkExtent2D *extent)
{
int32_t src_x1 = src ? src->x : 0;
int32_t src_y1 = src ? src->y : 0;
r3d_coords_raw(cs, (float[]) {
dst->x, dst->y,
src_x1, src_y1,
dst->x + extent->width, dst->y + extent->height,
src_x1 + extent->width, src_y1 + extent->height,
});
}
static void
r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
switch (format) {
case VK_FORMAT_X8_D24_UNORM_PACK32:
case VK_FORMAT_D24_UNORM_S8_UINT: {
/* cleared as r8g8b8a8_unorm using special format */
uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
} break;
case VK_FORMAT_D16_UNORM:
case VK_FORMAT_D32_SFLOAT:
tu_cs_emit(cs, fui(val->depthStencil.depth));
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
break;
case VK_FORMAT_S8_UINT:
tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
break;
default:
/* color formats use the clear value as-is */
assert(!vk_format_is_depth_or_stencil(format));
tu_cs_emit_array(cs, val->color.uint32, 4);
break;
}
}
static void
r3d_src_common(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const uint32_t *tex_const,
uint32_t offset_base,
uint32_t offset_ubwc,
VkFilter filter)
{
struct tu_cs_memory texture = { };
VkResult result = tu_cs_alloc(&cmd->sub_cs,
2, /* allocate space for a sampler too */
A6XX_TEX_CONST_DWORDS, &texture);
if (result != VK_SUCCESS) {
cmd->record_result = result;
return;
}
memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
/* patch addresses for layer offset */
*(uint64_t*) (texture.map + 4) += offset_base;
uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
texture.map[7] = ubwc_addr;
texture.map[8] = ubwc_addr >> 32;
texture.map[A6XX_TEX_CONST_DWORDS + 0] =
A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
0x60000; /* XXX used by blob, doesn't seem necessary */
texture.map[A6XX_TEX_CONST_DWORDS + 1] =
0x1 | /* XXX used by blob, doesn't seem necessary */
A6XX_TEX_SAMP_1_UNNORM_COORDS |
A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit_qw(cs, texture.iova);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
tu_cs_emit_qw(cs, texture.iova);
tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}
static void
r3d_src(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *iview,
uint32_t layer,
VkFilter filter)
{
r3d_src_common(cmd, cs, iview->descriptor,
iview->layer_size * layer,
iview->ubwc_layer_size * layer,
filter);
}
static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat vk_format,
uint64_t va, uint32_t pitch,
uint32_t width, uint32_t height)
{
uint32_t desc[A6XX_TEX_CONST_DWORDS];
struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);
desc[0] =
COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
A6XX_TEX_CONST_0_FMT(format.fmt) |
A6XX_TEX_CONST_0_SWAP(format.swap) |
A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
// XXX replicate X into all channels so the value lands in .w for stencil buffer_to_image
A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
desc[2] =
A6XX_TEX_CONST_2_PITCH(pitch) |
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
desc[3] = 0;
desc[4] = va;
desc[5] = va >> 32;
for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
desc[i] = 0;
r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}
static void
r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
tu_cs_image_ref(cs, iview, layer);
tu_cs_emit(cs, 0);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
tu_cs_image_flag_ref(cs, iview, layer);
tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
}
static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
tu6_emit_msaa(cs, iview->image->samples); /* TODO: move to setup */
tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
tu_cs_image_stencil_ref(cs, iview, layer);
tu_cs_emit(cs, 0);
tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}
static void
r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);
tu6_emit_msaa(cs, 1); /* TODO: move to setup */
tu_cs_emit_regs(cs,
A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
A6XX_RB_MRT_PITCH(0, pitch),
A6XX_RB_MRT_ARRAY_PITCH(0, 0),
A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
A6XX_RB_MRT_BASE_HI(0, va >> 32),
A6XX_RB_MRT_BASE_GMEM(0, 0));
tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}
static uint8_t
aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
{
uint8_t mask = 0xf;
assert(aspect_mask);
/* note: the only format with partial writing is D24S8,
* clear/blit uses the _AS_R8G8B8A8 format to access it
*/
if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
mask = 0x7;
if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
mask = 0x8;
}
return mask;
}
static void
r3d_setup(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat vk_format,
VkImageAspectFlags aspect_mask,
enum a6xx_rotation rotation,
bool clear,
bool ubwc)
{
enum a6xx_format format = tu6_base_format(vk_format);
if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
}
if (!cmd->state.pass) {
tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
}
tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));
r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
0xfc000000);
tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));
tu_cs_emit_regs(cs,
A6XX_RB_FS_OUTPUT_CNTL0(),
A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));
tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
.color_format = format,
.color_sint = vk_format_is_sint(vk_format),
.color_uint = vk_format_is_uint(vk_format)));
tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
.component_enable = aspect_write_mask(vk_format, aspect_mask)));
tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
if (cmd->state.predication_active) {
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
tu_cs_emit(cs, 0);
}
}
static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
tu_cs_emit(cs, 1); /* instance count */
tu_cs_emit(cs, 2); /* vertex count */
}
static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
if (cmd->state.predication_active) {
tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
tu_cs_emit(cs, 1);
}
}
/* blit ops - common interface for 2d/shader paths */
struct blit_ops {
void (*coords)(struct tu_cs *cs,
const VkOffset2D *dst,
const VkOffset2D *src,
const VkExtent2D *extent);
void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
void (*src)(
struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *iview,
uint32_t layer,
VkFilter filter);
void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
VkFormat vk_format,
uint64_t va, uint32_t pitch,
uint32_t width, uint32_t height);
void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
void (*setup)(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat vk_format,
VkImageAspectFlags aspect_mask,
enum a6xx_rotation rotation,
bool clear,
bool ubwc);
void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void (*teardown)(struct tu_cmd_buffer *cmd,
struct tu_cs *cs);
};
static const struct blit_ops r2d_ops = {
.coords = r2d_coords,
.clear_value = r2d_clear_value,
.src = r2d_src,
.src_buffer = r2d_src_buffer,
.dst = r2d_dst,
.dst_buffer = r2d_dst_buffer,
.setup = r2d_setup,
.run = r2d_run,
.teardown = r2d_teardown,
};
static const struct blit_ops r3d_ops = {
.coords = r3d_coords,
.clear_value = r3d_clear_value,
.src = r3d_src,
.src_buffer = r3d_src_buffer,
.dst = r3d_dst,
.dst_buffer = r3d_dst_buffer,
.setup = r3d_setup,
.run = r3d_run,
.teardown = r3d_teardown,
};
/* passthrough set coords from 3D extents */
static void
coords(const struct blit_ops *ops,
struct tu_cs *cs,
const VkOffset3D *dst,
const VkOffset3D *src,
const VkExtent3D *extent)
{
ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
}
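/* Map a VkFormat to the format actually used for copies: compressed
 * formats are copied as uint formats of the same block size, and
 * multi-plane or depth/stencil formats are reduced to the format of the
 * single aspect being copied.
 */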
static VkFormat
copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
{
if (vk_format_is_compressed(format)) {
switch (vk_format_get_blocksize(format)) {
case 1: return VK_FORMAT_R8_UINT;
case 2: return VK_FORMAT_R16_UINT;
case 4: return VK_FORMAT_R32_UINT;
case 8: return VK_FORMAT_R32G32_UINT;
case 16:return VK_FORMAT_R32G32B32A32_UINT;
default:
unreachable("unhandled format size");
}
}
switch (format) {
case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
return VK_FORMAT_R8G8_UNORM;
/* fallthrough */
case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
return VK_FORMAT_R8_UNORM;
case VK_FORMAT_D24_UNORM_S8_UINT:
if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
return VK_FORMAT_R8_UNORM;
/* fallthrough */
default:
return format;
case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
return VK_FORMAT_R32_UINT;
case VK_FORMAT_D32_SFLOAT_S8_UINT:
if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
return VK_FORMAT_S8_UINT;
assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
return VK_FORMAT_D32_SFLOAT;
}
}
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_image *image,
const VkClearValue *value)
{
const struct blit_ops *ops = &r2d_ops;
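/* The LRZ buffer is cleared as a linear D16 buffer, so the pitch in bytes
 * is lrz_pitch * 2 (2 bytes per texel).
 */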
ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, ROTATE_0, true, false);
ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);
ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,
image->bo->iova + image->bo_offset + image->lrz_offset,
image->lrz_pitch * 2);
ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
ops->run(cmd, cs);
ops->teardown(cmd, cs);
}
static void
tu_image_view_copy_blit(struct tu_image_view *iview,
struct tu_image *image,
VkFormat format,
const VkImageSubresourceLayers *subres,
uint32_t layer,
bool stencil_read)
{
VkImageAspectFlags aspect_mask = subres->aspectMask;
/* always use the AS_R8G8B8A8 format for these */
if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
format == VK_FORMAT_X8_D24_UNORM_PACK32) {
aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
}
tu_image_view_init(iview, &(VkImageViewCreateInfo) {
.image = tu_image_to_handle(image),
.viewType = VK_IMAGE_VIEW_TYPE_2D,
.format = format,
/* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
.components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
.subresourceRange = {
.aspectMask = aspect_mask,
.baseMipLevel = subres->mipLevel,
.levelCount = 1,
.baseArrayLayer = subres->baseArrayLayer + layer,
.layerCount = 1,
},
}, false);
}
static void
tu_image_view_copy(struct tu_image_view *iview,
struct tu_image *image,
VkFormat format,
const VkImageSubresourceLayers *subres,
uint32_t layer,
bool stencil_read)
{
format = copy_format(format, subres->aspectMask, false);
tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
}
static void
tu_image_view_blit(struct tu_image_view *iview,
struct tu_image *image,
const VkImageSubresourceLayers *subres,
uint32_t layer)
{
tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
}
static void
tu6_blit_image(struct tu_cmd_buffer *cmd,
struct tu_image *src_image,
struct tu_image *dst_image,
const VkImageBlit *info,
VkFilter filter)
{
const struct blit_ops *ops = &r2d_ops;
struct tu_cs *cs = &cmd->cs;
uint32_t layers;
/* The 2D blitter can't mirror from coordinates alone, so mirroring is expressed via the rotation field */
static const enum a6xx_rotation rotate[2][2] = {
{ROTATE_0, ROTATE_HFLIP},
{ROTATE_VFLIP, ROTATE_180},
};
bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
(info->dstOffsets[1].x < info->dstOffsets[0].x);
bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
(info->dstOffsets[1].y < info->dstOffsets[0].y);
bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
(info->dstOffsets[1].z < info->dstOffsets[0].z);
if (mirror_z) {
tu_finishme("blit z mirror\n");
return;
}
if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
info->dstOffsets[1].z - info->dstOffsets[0].z) {
tu_finishme("blit z filter\n");
return;
}
layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
if (info->dstSubresource.layerCount > 1) {
assert(layers <= 1);
layers = info->dstSubresource.layerCount;
}
/* BC1_RGB_* formats need to have their last component overridden with 1
* when sampling, which is normally handled with the texture descriptor
* swizzle. The 2d path can't handle that, so use the 3d path.
*
* TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
* the 2d path.
*/
if (dst_image->samples > 1 ||
src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
filter == VK_FILTER_CUBIC_EXT)
ops = &r3d_ops;
/* use the right format in setup() for D32_S8
* TODO: this probably should use a helper
*/
VkFormat format = dst_image->vk_format;
if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
format = VK_FORMAT_D32_SFLOAT;
else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
format = VK_FORMAT_S8_UINT;
else
unreachable("unexpected D32_S8 aspect mask in blit_image");
}
ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);
if (ops == &r3d_ops) {
r3d_coords_raw(cs, (float[]) {
info->dstOffsets[0].x, info->dstOffsets[0].y,
info->srcOffsets[0].x, info->srcOffsets[0].y,
info->dstOffsets[1].x, info->dstOffsets[1].y,
info->srcOffsets[1].x, info->srcOffsets[1].y
});
} else {
tu_cs_emit_regs(cs,
A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
.y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
.y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
tu_cs_emit_regs(cs,
A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
}
struct tu_image_view dst, src;
tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
for (uint32_t i = 0; i < layers; i++) {
ops->dst(cs, &dst, i);
ops->src(cmd, cs, &src, i, filter);
ops->run(cmd, cs);
}
ops->teardown(cmd, cs);
}
void
tu_CmdBlitImage(VkCommandBuffer commandBuffer,
VkImage srcImage,
VkImageLayout srcImageLayout,
VkImage dstImage,
VkImageLayout dstImageLayout,
uint32_t regionCount,
const VkImageBlit *pRegions,
VkFilter filter)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_image, src_image, srcImage);
TU_FROM_HANDLE(tu_image, dst_image, dstImage);
for (uint32_t i = 0; i < regionCount; ++i) {
/* can't blit both depth and stencil at once with D32_S8
* TODO: more advanced 3D blit path to support it instead?
*/
if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
VkImageBlit region = pRegions[i];
uint32_t b;
for_each_bit(b, pRegions[i].dstSubresource.aspectMask) {
region.srcSubresource.aspectMask = BIT(b);
region.dstSubresource.aspectMask = BIT(b);
tu6_blit_image(cmd, src_image, dst_image, &region, filter);
}
continue;
}
tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
}
}
static void
copy_compressed(VkFormat format,
VkOffset3D *offset,
VkExtent3D *extent,
uint32_t *width,
uint32_t *height)
{
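/* Convert texel coordinates to compressed-block coordinates, e.g. for a
 * BC1 image (4x4 blocks) an offset of (8, 4) becomes (2, 1) and a 10x10
 * extent rounds up to 3x3 blocks.
 */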
if (!vk_format_is_compressed(format))
return;
uint32_t block_width = vk_format_get_blockwidth(format);
uint32_t block_height = vk_format_get_blockheight(format);
offset->x /= block_width;
offset->y /= block_height;
if (extent) {
extent->width = DIV_ROUND_UP(extent->width, block_width);
extent->height = DIV_ROUND_UP(extent->height, block_height);
}
if (width)
*width = DIV_ROUND_UP(*width, block_width);
if (height)
*height = DIV_ROUND_UP(*height, block_height);
}
static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
struct tu_buffer *src_buffer,
struct tu_image *dst_image,
const VkBufferImageCopy *info)
{
struct tu_cs *cs = &cmd->cs;
uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
VkFormat src_format =
copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
const struct blit_ops *ops = &r2d_ops;
/* special case for buffer to stencil */
if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
ops = &r3d_ops;
}
/* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
* which matters for UBWC. buffer_to_image/etc can fail because of this
*/
VkOffset3D offset = info->imageOffset;
VkExtent3D extent = info->imageExtent;
uint32_t src_width = info->bufferRowLength ?: extent.width;
uint32_t src_height = info->bufferImageHeight ?: extent.height;
copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);
uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
uint32_t layer_size = src_height * pitch;
ops->setup(cmd, cs,
copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);
struct tu_image_view dst;
tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);
for (uint32_t i = 0; i < layers; i++) {
ops->dst(cs, &dst, i);
uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
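/* The 2D engine apparently requires the buffer address and pitch to be
 * 64-byte aligned: if either isn't, blit one row at a time from the
 * aligned-down address, compensating with an x offset in texels.
 */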
if ((src_va & 63) || (pitch & 63)) {
for (uint32_t y = 0; y < extent.height; y++) {
uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
x + extent.width, 1);
ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
&(VkExtent2D) {extent.width, 1});
ops->run(cmd, cs);
src_va += pitch;
}
} else {
ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
ops->run(cmd, cs);
}
}
ops->teardown(cmd, cs);
}
void
tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
VkBuffer srcBuffer,
VkImage dstImage,
VkImageLayout dstImageLayout,
uint32_t regionCount,
const VkBufferImageCopy *pRegions)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_image, dst_image, dstImage);
TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
for (unsigned i = 0; i < regionCount; ++i)
tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
}
static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
struct tu_image *src_image,
struct tu_buffer *dst_buffer,
const VkBufferImageCopy *info)
{
struct tu_cs *cs = &cmd->cs;
uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
VkFormat dst_format =
copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
bool stencil_read = false;
if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
stencil_read = true;
}
const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
VkOffset3D offset = info->imageOffset;
VkExtent3D extent = info->imageExtent;
uint32_t dst_width = info->bufferRowLength ?: extent.width;
uint32_t dst_height = info->bufferImageHeight ?: extent.height;
copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);
uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
uint32_t layer_size = pitch * dst_height;
ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
struct tu_image_view src;
tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);
for (uint32_t i = 0; i < layers; i++) {
ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
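/* Same 64-byte alignment workaround as in tu_copy_buffer_to_image: write
 * one row at a time when the address or pitch is unaligned.
 */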
if ((dst_va & 63) || (pitch & 63)) {
for (uint32_t y = 0; y < extent.height; y++) {
uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
&(VkExtent2D) {extent.width, 1});
ops->run(cmd, cs);
dst_va += pitch;
}
} else {
ops->dst_buffer(cs, dst_format, dst_va, pitch);
coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
ops->run(cmd, cs);
}
}
ops->teardown(cmd, cs);
}
void
tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
VkImage srcImage,
VkImageLayout srcImageLayout,
VkBuffer dstBuffer,
uint32_t regionCount,
const VkBufferImageCopy *pRegions)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_image, src_image, srcImage);
TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
for (unsigned i = 0; i < regionCount; ++i)
tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
}
/* Tiled formats don't support swapping, which means that we can't support
* formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
* formats like B5G5R5A1 have a separate linear-only format when sampling.
* Currently we fake support for tiled swapped formats and use the unswapped
* format instead, but this means that reinterpreting copies to and from
* swapped formats can't be performed correctly unless we can swizzle the
* components by reinterpreting the other image as the "correct" swapped
* format, i.e. only when the other image is linear.
*/
static bool
is_swapped_format(VkFormat format)
{
struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
}
/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
* therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
* versa). This should mirror the logic in fdl6_layout.
*/
static bool
image_is_r8g8(struct tu_image *image)
{
return image->layout[0].cpp == 2 &&
vk_format_get_nr_components(image->vk_format) == 2;
}
static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
struct tu_image *src_image,
struct tu_image *dst_image,
const VkImageCopy *info)
{
const struct blit_ops *ops = &r2d_ops;
struct tu_cs *cs = &cmd->cs;
if (dst_image->samples > 1)
ops = &r3d_ops;
VkFormat format = VK_FORMAT_UNDEFINED;
VkOffset3D src_offset = info->srcOffset;
VkOffset3D dst_offset = info->dstOffset;
VkExtent3D extent = info->extent;
/* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
* Images":
*
* When copying between compressed and uncompressed formats the extent
* members represent the texel dimensions of the source image and not
* the destination. When copying from a compressed image to an
* uncompressed image the image texel dimensions written to the
* uncompressed image will be source extent divided by the compressed
* texel block dimensions. When copying from an uncompressed image to a
* compressed image the image texel dimensions written to the compressed
* image will be the source extent multiplied by the compressed texel
* block dimensions.
*
* This means we only have to adjust the extent if the source image is
* compressed.
*/
copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);
VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);
bool use_staging_blit = false;
if (src_format == dst_format) {
/* Images that share a format can always be copied directly because it's
* the same as a blit.
*/
format = src_format;
} else if (!src_image->layout[0].tile_mode) {
/* If an image is linear, we can always safely reinterpret it with the
* other image's format and then do a regular blit.
*/
format = dst_format;
} else if (!dst_image->layout[0].tile_mode) {
format = src_format;
} else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
/* We can't currently copy r8g8 images to/from other cpp=2 images,
* due to the different tile layout.
*/
use_staging_blit = true;
} else if (is_swapped_format(src_format) ||
is_swapped_format(dst_format)) {
/* If either format has a non-identity swap, then we can't copy
* to/from it.
*/
use_staging_blit = true;
} else if (!src_image->layout[0].ubwc) {
format = dst_format;
} else if (!dst_image->layout[0].ubwc) {
format = src_format;
} else {
/* Both formats use UBWC and so neither can be reinterpreted.
* TODO: We could do an in-place decompression of the dst instead.
*/
use_staging_blit = true;
}
struct tu_image_view dst, src;
if (use_staging_blit) {
tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);
struct tu_image staging_image = {
.vk_format = src_format,
.type = src_image->type,
.tiling = VK_IMAGE_TILING_LINEAR,
.extent = extent,
.level_count = 1,
.layer_count = info->srcSubresource.layerCount,
.samples = src_image->samples,
.bo_offset = 0,
};
VkImageSubresourceLayers staging_subresource = {
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.mipLevel = 0,
.baseArrayLayer = 0,
.layerCount = info->srcSubresource.layerCount,
};
VkOffset3D staging_offset = { 0 };
staging_image.layout[0].tile_mode = TILE6_LINEAR;
staging_image.layout[0].ubwc = false;
fdl6_layout(&staging_image.layout[0],
vk_format_to_pipe_format(staging_image.vk_format),
staging_image.samples,
staging_image.extent.width,
staging_image.extent.height,
staging_image.extent.depth,
staging_image.level_count,
staging_image.layer_count,
staging_image.type == VK_IMAGE_TYPE_3D,
NULL);
VkResult result = tu_get_scratch_bo(cmd->device,
staging_image.layout[0].size,
&staging_image.bo);
if (result != VK_SUCCESS) {
cmd->record_result = result;
return;
}
struct tu_image_view staging;
tu_image_view_copy(&staging, &staging_image, src_format,
&staging_subresource, 0, false);
ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
coords(ops, cs, &staging_offset, &src_offset, &extent);
for (uint32_t i = 0; i < info->extent.depth; i++) {
ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
ops->dst(cs, &staging, i);
ops->run(cmd, cs);
}
/* If the user were doing this copy, a pipeline barrier would be required
* here; since we're doing it internally, we have to flush ourselves.
*/
tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
tu_image_view_copy(&staging, &staging_image, dst_format,
&staging_subresource, 0, false);
ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
ROTATE_0, false, dst_image->layout[0].ubwc);
coords(ops, cs, &dst_offset, &staging_offset, &extent);
for (uint32_t i = 0; i < info->extent.depth; i++) {
ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
ops->dst(cs, &dst, i);
ops->run(cmd, cs);
}
} else {
tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);
ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
ROTATE_0, false, dst_image->layout[0].ubwc);
coords(ops, cs, &dst_offset, &src_offset, &extent);
for (uint32_t i = 0; i < info->extent.depth; i++) {
ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
ops->dst(cs, &dst, i);
ops->run(cmd, cs);
}
}
ops->teardown(cmd, cs);
}
void
tu_CmdCopyImage(VkCommandBuffer commandBuffer,
VkImage srcImage,
VkImageLayout srcImageLayout,
VkImage destImage,
VkImageLayout destImageLayout,
uint32_t regionCount,
const VkImageCopy *pRegions)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_image, src_image, srcImage);
TU_FROM_HANDLE(tu_image, dst_image, destImage);
for (uint32_t i = 0; i < regionCount; ++i)
tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
}
static void
copy_buffer(struct tu_cmd_buffer *cmd,
uint64_t dst_va,
uint64_t src_va,
uint64_t size,
uint32_t block_size)
{
const struct blit_ops *ops = &r2d_ops;
struct tu_cs *cs = &cmd->cs;
VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
uint64_t blocks = size / block_size;
ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
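/* Copy in runs of at most 0x4000 blocks (presumably the 2D engine's width
 * limit), aligning addresses down to 64 bytes and compensating with x
 * offsets in block units.
 */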
while (blocks) {
uint32_t src_x = (src_va & 63) / block_size;
uint32_t dst_x = (dst_va & 63) / block_size;
uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
ops->dst_buffer( cs, format, dst_va & ~63, 0);
ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
ops->run(cmd, cs);
src_va += width * block_size;
dst_va += width * block_size;
blocks -= width;
}
ops->teardown(cmd, cs);
}
void
tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
VkBuffer srcBuffer,
VkBuffer dstBuffer,
uint32_t regionCount,
const VkBufferCopy *pRegions)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);
for (unsigned i = 0; i < regionCount; ++i) {
copy_buffer(cmd,
tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
pRegions[i].size, 1);
}
}
void
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
VkBuffer dstBuffer,
VkDeviceSize dstOffset,
VkDeviceSize dataSize,
const void *pData)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
struct tu_cs_memory tmp;
VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
if (result != VK_SUCCESS) {
cmd->record_result = result;
return;
}
memcpy(tmp.map, pData, dataSize);
copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
}
void
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
VkBuffer dstBuffer,
VkDeviceSize dstOffset,
VkDeviceSize fillSize,
uint32_t data)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
const struct blit_ops *ops = &r2d_ops;
struct tu_cs *cs = &cmd->cs;
if (fillSize == VK_WHOLE_SIZE)
fillSize = buffer->size - dstOffset;
uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
uint32_t blocks = fillSize / 4;
ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});
while (blocks) {
uint32_t dst_x = (dst_va & 63) / 4;
uint32_t width = MIN2(blocks, 0x4000 - dst_x);
ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
ops->run(cmd, cs);
dst_va += width * 4;
blocks -= width;
}
ops->teardown(cmd, cs);
}
void
tu_CmdResolveImage(VkCommandBuffer commandBuffer,
VkImage srcImage,
VkImageLayout srcImageLayout,
VkImage dstImage,
VkImageLayout dstImageLayout,
uint32_t regionCount,
const VkImageResolve *pRegions)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_image, src_image, srcImage);
TU_FROM_HANDLE(tu_image, dst_image, dstImage);
const struct blit_ops *ops = &r2d_ops;
struct tu_cs *cs = &cmd->cs;
ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
ROTATE_0, false, dst_image->layout[0].ubwc);
for (uint32_t i = 0; i < regionCount; ++i) {
const VkImageResolve *info = &pRegions[i];
uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);
assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
/* TODO: aspect masks possible? */
coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);
struct tu_image_view dst, src;
tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);
for (uint32_t i = 0; i < layers; i++) {
ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
ops->dst(cs, &dst, i);
ops->run(cmd, cs);
}
}
ops->teardown(cmd, cs);
}
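/* Iterate either over the set bits of a multiview mask (when layer_mask
 * != 0) or over layers 0..layers-1; e.g. layer_mask = 0x5 visits layers 0
 * and 2.
 */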
#define for_each_layer(layer, layer_mask, layers) \
for (uint32_t layer = 0; \
layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
layer++) \
if (!layer_mask || (layer_mask & BIT(layer)))
void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_image_view *src,
struct tu_image_view *dst,
uint32_t layer_mask,
uint32_t layers,
const VkRect2D *rect)
{
const struct blit_ops *ops = &r2d_ops;
assert(src->image->vk_format == dst->image->vk_format);
ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
ROTATE_0, false, dst->ubwc_enabled);
ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
for_each_layer(i, layer_mask, layers) {
ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
ops->dst(cs, dst, i);
ops->run(cmd, cs);
}
ops->teardown(cmd, cs);
}
static void
clear_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
const VkClearValue *clear_value,
const VkImageSubresourceRange *range,
VkImageAspectFlags aspect_mask)
{
uint32_t level_count = tu_get_levelCount(image, range);
uint32_t layer_count = tu_get_layerCount(image, range);
struct tu_cs *cs = &cmd->cs;
VkFormat format = image->vk_format;
if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
format = copy_format(format, aspect_mask, false);
if (image->type == VK_IMAGE_TYPE_3D) {
assert(layer_count == 1);
assert(range->baseArrayLayer == 0);
}
const struct blit_ops *ops = image->samples > 1 ? &r3d_ops : &r2d_ops;
ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc);
if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
else
ops->clear_value(cs, format, clear_value);
for (unsigned j = 0; j < level_count; j++) {
if (image->type == VK_IMAGE_TYPE_3D)
layer_count = u_minify(image->extent.depth, range->baseMipLevel + j);
ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
u_minify(image->extent.width, range->baseMipLevel + j),
u_minify(image->extent.height, range->baseMipLevel + j)
});
struct tu_image_view dst;
tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
.aspectMask = aspect_mask,
.mipLevel = range->baseMipLevel + j,
.baseArrayLayer = range->baseArrayLayer,
.layerCount = 1,
}, 0, false);
for (uint32_t i = 0; i < layer_count; i++) {
ops->dst(cs, &dst, i);
ops->run(cmd, cs);
}
}
ops->teardown(cmd, cs);
}
void
tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
VkImage image_h,
VkImageLayout imageLayout,
const VkClearColorValue *pColor,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_image, image, image_h);
for (unsigned i = 0; i < rangeCount; i++)
clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
}
void
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
VkImage image_h,
VkImageLayout imageLayout,
const VkClearDepthStencilValue *pDepthStencil,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_image, image, image_h);
for (unsigned i = 0; i < rangeCount; i++) {
const VkImageSubresourceRange *range = &pRanges[i];
if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
/* can't clear both depth and stencil at once, split up the aspect mask */
uint32_t b;
for_each_bit(b, range->aspectMask)
clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
continue;
}
clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
}
}
static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
uint32_t attachment_count,
const VkClearAttachment *attachments,
uint32_t rect_count,
const VkClearRect *rects)
{
/* the shader path here is special: it avoids changing MRT/etc state */
const struct tu_render_pass *pass = cmd->state.pass;
const struct tu_subpass *subpass = cmd->state.subpass;
const uint32_t mrt_count = subpass->color_count;
struct tu_cs *cs = &cmd->draw_cs;
uint32_t clear_value[MAX_RTS][4];
float z_clear_val = 0.0f;
uint8_t s_clear_val = 0;
uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
bool z_clear = false;
bool s_clear = false;
bool layered_clear = false;
uint32_t max_samples = 1;
for (uint32_t i = 0; i < attachment_count; i++) {
uint32_t a;
if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
uint32_t c = attachments[i].colorAttachment;
a = subpass->color_attachments[c].attachment;
if (a == VK_ATTACHMENT_UNUSED)
continue;
clear_rts |= 1 << c;
clear_components |= 0xf << (c * 4);
memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
} else {
a = subpass->depth_stencil_attachment.attachment;
if (a == VK_ATTACHMENT_UNUSED)
continue;
if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
z_clear = true;
z_clear_val = attachments[i].clearValue.depthStencil.depth;
}
if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
s_clear = true;
s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
}
}
max_samples = MAX2(max_samples, pass->attachments[a].samples);
}
/* disable all draw states so they don't interfere
* TODO: use and re-use draw states
* we have to disable draw states individually to preserve
* input attachment states, because a secondary command buffer
* won't be able to restore them
*/
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
continue;
tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
CP_SET_DRAW_STATE__0_DISABLE);
tu_cs_emit_qw(cs, 0);
}
cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
0xfc000000);
tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
for (uint32_t i = 0; i < mrt_count; i++) {
if (clear_rts & (1 << i))
tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
else
tu_cs_emit(cs, 0);
}
for (uint32_t i = 0; i < rect_count; i++) {
if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
layered_clear = true;
}
/* a630 doesn't support multiview masks, which means that we can't use the
* normal multiview path without potentially recompiling a shader on-demand
* or using a more complicated variant that takes the mask as a const. Just
* use the layered path instead, since it shouldn't be much worse.
*/
if (subpass->multiview_mask) {
layered_clear = true;
}
r3d_common(cmd, cs, false, num_rts, layered_clear);
tu_cs_emit_regs(cs,
A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
tu_cs_emit_regs(cs,
A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
tu_cs_emit_regs(cs,
A6XX_RB_FS_OUTPUT_CNTL0(),
A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
for (uint32_t i = 0; i < mrt_count; i++) {
tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
.component_enable = COND(clear_rts & (1 << i), 0xf)));
}
if (z_clear) {
tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
}
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
.z_enable = z_clear,
.z_write_enable = z_clear,
.zfunc = FUNC_ALWAYS));
tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
.stencil_enable = s_clear,
.func = FUNC_ALWAYS,
.zpass = STENCIL_REPLACE));
tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
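/* Upload the packed clear colors as FS constants: a 3-dword CP_LOAD_STATE6
 * header followed by one vec4 (4 dwords) per cleared RT, which the clear
 * shader presumably copies straight to its color outputs.
 */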
tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
for_each_bit(b, clear_rts)
tu_cs_emit_array(cs, clear_value[b], 4);
for (uint32_t i = 0; i < rect_count; i++) {
/* This should be true because of this valid usage for
* vkCmdClearAttachments:
*
* "If the render pass instance this is recorded in uses multiview,
* then baseArrayLayer must be zero and layerCount must be one"
*/
assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
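/* The raw coords are two vec4s, (x0, y0, z, layer) and (x1, y1, z, 1.0);
 * z carries the depth clear value, and the layer index is smuggled through
 * the first w component as raw bits via uif().
 */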
r3d_coords_raw(cs, (float[]) {
rects[i].rect.offset.x, rects[i].rect.offset.y,
z_clear_val, uif(rects[i].baseArrayLayer + layer),
rects[i].rect.offset.x + rects[i].rect.extent.width,
rects[i].rect.offset.y + rects[i].rect.extent.height,
z_clear_val, 1.0f,
});
r3d_run(cmd, cs);
}
}
}
static void
pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
{
switch (format) {
case VK_FORMAT_X8_D24_UNORM_PACK32:
case VK_FORMAT_D24_UNORM_S8_UINT:
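/* Worked example: depth = 1.0f, stencil = 0x80 gives
 * tu_pack_float32_for_unorm(1.0f, 24) = 0xffffff, and ORing in
 * 0x80 << 24 produces 0x80ffffff.
 */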
clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
val->depthStencil.stencil << 24;
return;
case VK_FORMAT_D16_UNORM:
clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
return;
case VK_FORMAT_D32_SFLOAT:
clear_value[0] = fui(val->depthStencil.depth);
return;
case VK_FORMAT_S8_UINT:
clear_value[0] = val->depthStencil.stencil;
return;
default:
break;
}
float tmp[4];
memcpy(tmp, val->color.float32, 4 * sizeof(float));
if (vk_format_is_srgb(format)) {
for (int i = 0; i < 4; i++)
tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
}
#define PACK_F(type) util_format_##type##_pack_rgba_float \
( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
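/* e.g. PACK_F(r5g6b5_unorm) expands to
 * util_format_r5g6b5_unorm_pack_rgba_float((uint8_t *) &clear_value[0],
 * 0, tmp, 0, 1, 1),
 * i.e. pack a single 1x1 "image" from the float RGBA values in tmp.
 */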
switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
case 4:
PACK_F(r4g4b4a4_unorm);
break;
case 5:
if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
PACK_F(r5g6b5_unorm);
else
PACK_F(r5g5b5a1_unorm);
break;
case 8:
if (vk_format_is_snorm(format))
PACK_F(r8g8b8a8_snorm);
else if (vk_format_is_unorm(format))
PACK_F(r8g8b8a8_unorm);
else
pack_int8(clear_value, val->color.uint32);
break;
case 10:
if (vk_format_is_int(format))
pack_int10_2(clear_value, val->color.uint32);
else
PACK_F(r10g10b10a2_unorm);
break;
case 11:
clear_value[0] = float3_to_r11g11b10f(val->color.float32);
break;
case 16:
if (vk_format_is_snorm(format))
PACK_F(r16g16b16a16_snorm);
else if (vk_format_is_unorm(format))
PACK_F(r16g16b16a16_unorm);
else if (vk_format_is_float(format))
PACK_F(r16g16b16a16_float);
else
pack_int16(clear_value, val->color.uint32);
break;
case 32:
memcpy(clear_value, val->color.float32, 4 * sizeof(float));
break;
default:
unreachable("unexpected channel size");
}
#undef PACK_F
}
static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat format,
uint8_t clear_mask,
uint32_t gmem_offset,
const VkClearValue *value)
{
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));
tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
tu_cs_emit(cs, gmem_offset);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
tu_cs_emit(cs, 0);
uint32_t clear_vals[4] = {};
pack_gmem_clear_value(value, format, clear_vals);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
tu_cs_emit_array(cs, clear_vals, 4);
tu6_emit_event_write(cmd, cs, BLIT);
}
static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t attachment,
VkImageAspectFlags mask,
const VkClearValue *value)
{
const struct tu_render_pass_attachment *att =
&cmd->state.pass->attachments[attachment];
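/* D32_SFLOAT_S8_UINT lives in GMEM as two separate planes (note the
 * distinct gmem_offset and gmem_offset_stencil), so clear each requested
 * aspect with its plane's format.
 */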
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
return;
}
clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
}
static void
tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
uint32_t attachment_count,
const VkClearAttachment *attachments,
uint32_t rect_count,
const VkClearRect *rects)
{
const struct tu_subpass *subpass = cmd->state.subpass;
struct tu_cs *cs = &cmd->draw_cs;
/* TODO: swap the loops for smaller cmdstream */
for (unsigned i = 0; i < rect_count; i++) {
unsigned x1 = rects[i].rect.offset.x;
unsigned y1 = rects[i].rect.offset.y;
unsigned x2 = x1 + rects[i].rect.extent.width - 1;
unsigned y2 = y1 + rects[i].rect.extent.height - 1;
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
for (unsigned j = 0; j < attachment_count; j++) {
uint32_t a;
if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
else
a = subpass->depth_stencil_attachment.attachment;
if (a == VK_ATTACHMENT_UNUSED)
continue;
tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
&attachments[j].clearValue);
}
}
}
void
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
uint32_t attachmentCount,
const VkClearAttachment *pAttachments,
uint32_t rectCount,
const VkClearRect *pRects)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
struct tu_cs *cs = &cmd->draw_cs;
/* The sysmem path behaves like a draw. Note that we don't have a way of
 * using different flushes for sysmem/gmem, so this needs to be outside of
 * the cond_exec.
 */
tu_emit_cache_flush_renderpass(cmd, cs);
/* vkCmdClearAttachments is supposed to respect the predicate if active.
* The easiest way to do this is to always use the 3d path, which always
* works even with GMEM because it's just a simple draw using the existing
* attachment state. However it seems that IGNORE_VISIBILITY draws must be
* skipped in the binning pass, since otherwise they produce binning data
* which isn't consumed and leads to the wrong binning data being read, so
* condition on GMEM | SYSMEM.
*/
if (cmd->state.predication_active) {
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
tu_cond_exec_end(cs);
return;
}
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
tu_cond_exec_end(cs);
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
tu_cond_exec_end(cs);
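/* Clearing depth here bypasses LRZ (the 3d path above explicitly disables
 * it), so the LRZ buffer no longer matches the depth attachment and must
 * be invalidated.
 */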
for (uint32_t j = 0; j < attachmentCount; j++) {
if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
continue;
cmd->state.lrz.valid = false;
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
}
}
static void
clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
VkFormat format,
VkImageAspectFlags clear_mask,
const VkRenderPassBeginInfo *info,
uint32_t a,
bool separate_stencil)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_image_view *iview = fb->attachments[a].attachment;
const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
const struct blit_ops *ops = &r2d_ops;
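/* The 2D engine presumably can't write per-sample to an MSAA destination,
 * so fall back to the 3d path for multisampled attachments.
 */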
if (cmd->state.pass->attachments[a].samples > 1)
ops = &r3d_ops;
ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
ops->clear_value(cs, format, &info->pClearValues[a]);
for_each_layer(i, clear_views, fb->layers) {
if (separate_stencil) {
if (ops == &r3d_ops)
r3d_dst_stencil(cs, iview, i);
else
r2d_dst_stencil(cs, iview, i);
} else {
ops->dst(cs, iview, i);
}
ops->run(cmd, cs);
}
ops->teardown(cmd, cs);
}
void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkRenderPassBeginInfo *info)
{
const struct tu_render_pass_attachment *attachment =
&cmd->state.pass->attachments[a];
if (!attachment->clear_mask)
return;
/* Wait for any flushes at the beginning of the renderpass to complete */
tu_cs_emit_wfi(cs);
if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
info, a, false);
}
if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
info, a, true);
}
} else {
clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
info, a, false);
}
/* The spec doesn't explicitly say, but presumably the initial renderpass
* clear is considered part of the renderpass, and therefore barriers
* aren't required inside the subpass/renderpass. Therefore we need to
* flush CCU color into CCU depth here, just like with
* vkCmdClearAttachments(). Note that because this only happens at the
* beginning of a renderpass, and renderpass writes are considered
* "incoherent", we shouldn't have to worry about syncing depth into color
* beforehand as depth should already be flushed.
*/
if (vk_format_is_depth_or_stencil(attachment->format)) {
tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
} else {
tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
}
}
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkRenderPassBeginInfo *info)
{
const struct tu_render_pass_attachment *attachment =
&cmd->state.pass->attachments[a];
if (!attachment->clear_mask)
return;
tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
&info->pClearValues[a]);
}
static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *iview,
const struct tu_render_pass_attachment *attachment,
bool resolve,
bool separate_stencil)
{
tu_cs_emit_regs(cs,
A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));
tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
.unk0 = !resolve,
.gmem = !resolve,
/* "integer" bit disables msaa resolve averaging */
.integer = vk_format_is_int(attachment->format)));
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
if (separate_stencil) {
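/* Clear the FLAGS bit: the separate stencil plane presumably has no UBWC
 * flag buffer of its own.
 */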
tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
tu_cs_emit_qw(cs, iview->stencil_base_addr);
tu_cs_emit(cs, iview->stencil_PITCH);
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
} else {
tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
tu_cs_image_ref_2d(cs, iview, 0, false);
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
tu_cs_image_flag_ref(cs, iview, 0);
tu_cs_emit_regs(cs,
A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
}
tu6_emit_event_write(cmd, cs, BLIT);
}
static bool
blit_can_resolve(VkFormat format)
{
const struct util_format_description *desc = vk_format_description(format);
/* The blit event can only resolve in simple cases: averaging samples as
 * unsigned integers, or selecting just one sample.
 */
if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
return false;
/* can't do formats with larger channel sizes
* note: this includes all float formats
* note2: single channel integer formats seem OK
*/
if (desc->channel[0].size > 10)
return false;
switch (format) {
/* For unknown reasons the blit event can't msaa-resolve these formats when
 * tiled, likely because these formats have a different layout from other
 * cpp=2 formats.
 */
case VK_FORMAT_R8G8_UNORM:
case VK_FORMAT_R8G8_UINT:
case VK_FORMAT_R8G8_SINT:
/* TODO: this one should be able to work? */
case VK_FORMAT_D24_UNORM_S8_UINT:
return false;
default:
break;
}
return true;
}
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
bool force_load)
{
const struct tu_image_view *iview =
cmd->state.framebuffer->attachments[a].attachment;
const struct tu_render_pass_attachment *attachment =
&cmd->state.pass->attachments[a];
if (attachment->load || force_load)
tu_emit_blit(cmd, cs, iview, attachment, false, false);
if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
tu_emit_blit(cmd, cs, iview, attachment, false, true);
}
static void
store_cp_blit(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_image_view *iview,
uint32_t samples,
bool separate_stencil,
VkFormat format,
uint32_t gmem_offset,
uint32_t cpp)
{
r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
iview->ubwc_enabled, true);
if (separate_stencil)
r2d_dst_stencil(cs, iview, 0);
else
r2d_dst(cs, iview, 0);
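/* The 2D engine reads the tile contents straight out of GMEM: the source
 * is tiled (TILE6_2), based at gmem_base + gmem_offset, with a pitch of
 * one GMEM tile row (tile0.width * cpp bytes).
 */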
tu_cs_emit_regs(cs,
A6XX_SP_PS_2D_SRC_INFO(
.color_format = tu6_format_texture(format, TILE6_2).fmt,
.tile_mode = TILE6_2,
.srgb = vk_format_is_srgb(format),
.samples = tu_msaa_samples(samples),
.samples_average = !vk_format_is_int(format),
.unk20 = 1,
.unk22 = 1),
/* note: src size does not matter when not scaling */
A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
A6XX_SP_PS_2D_SRC_HI(),
A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));
/* sync GMEM writes with CACHE. */
tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
/* Wait for CACHE_INVALIDATE to land */
tu_cs_emit_wfi(cs);
tu_cs_emit_pkt7(cs, CP_BLIT, 1);
tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
/* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
* sysmem, and we generally assume that GMEM renderpasses leave their
* results in sysmem, so we need to flush manually here.
*/
tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
uint32_t gmem_a)
{
const VkRect2D *render_area = &cmd->state.render_area;
struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
if (!dst->store && !dst->store_stencil)
return;
uint32_t x1 = render_area->offset.x;
uint32_t y1 = render_area->offset.y;
uint32_t x2 = x1 + render_area->extent.width;
uint32_t y2 = y1 + render_area->extent.height;
/* x2/y2 can be unaligned if equal to the size of the image, since the blit
 * will then write into padding space. The one exception is linear levels,
 * which don't have the required y padding in the layout (except for the
 * last level).
 */
bool need_y2_align =
y2 != iview->extent.height || iview->need_y2_align;
bool unaligned =
x1 % GMEM_ALIGN_W || (x2 % GMEM_ALIGN_W && x2 != iview->extent.width) ||
y1 % GMEM_ALIGN_H || (y2 % GMEM_ALIGN_H && need_y2_align);
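/* Example: a render area covering the full image passes this test even
 * with an unaligned size, since x2 == width is exempt and y2 == height is
 * exempt except for the linear-level case noted above.
 */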
/* use fast path when render area is aligned, except for unsupported resolve cases */
if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
if (dst->store)
tu_emit_blit(cmd, cs, iview, src, true, false);
if (dst->store_stencil)
tu_emit_blit(cmd, cs, iview, src, true, true);
return;
}
if (dst->samples > 1) {
/* I guess we need to use the shader path in this case?
 * Need a testcase which fails because of this.
 */
tu_finishme("unaligned store of msaa attachment\n");
return;
}
r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);
VkFormat format = src->format;
if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
format = VK_FORMAT_D32_SFLOAT;
if (dst->store) {
store_cp_blit(cmd, cs, iview, src->samples, false, format,
src->gmem_offset, src->cpp);
}
if (dst->store_stencil) {
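/* Separate stencil is one byte per sample, so the effective cpp for the
 * GMEM pitch computation is (presumably) the sample count.
 */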
store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
src->gmem_offset_stencil, src->samples);
}
}