blob: 593082bd7de073da66e2bfadf7fe34db30ff7423 [file] [log] [blame]
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.h
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#pragma once
void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
enum SWR_BACKEND_FUNCS
{
SWR_BACKEND_SINGLE_SAMPLE,
SWR_BACKEND_MSAA_PIXEL_RATE,
SWR_BACKEND_MSAA_SAMPLE_RATE,
SWR_BACKEND_FUNCS_MAX,
};
#if KNOB_SIMD_WIDTH == 8
static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#define MASK 0xff
#endif
static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ)
{
simdscalar vClipMask = _simd_setzero_ps();
uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
for (uint32_t i = 0; i < numClipDistance; ++i)
{
// pull triangle clip distance values from clip buffer
simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
// interpolate
simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
// clip if interpolated clip distance is < 0 || NAN
simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
vClipMask = _simd_or_ps(vClipMask, vCull);
}
return _simd_movemask_ps(vClipMask);
}
INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileColorOffsets[16]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
};
assert(sampleNum < 16);
return RasterTileColorOffsets[sampleNum];
}
INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileDepthOffsets[16]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
};
assert(sampleNum < 16);
return RasterTileDepthOffsets[sampleNum];
}
INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
{
static const uint32_t RasterTileStencilOffsets[16]
{ 0,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
};
assert(sampleNum < 16);
return RasterTileStencilOffsets[sampleNum];
}
template<typename T, uint32_t InputCoverage>
struct generateInputCoverage
{
INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
{
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
simdscalari mask[2];
simdscalari sampleCoverage[2];
if(T::bIsCenterPattern)
{
// center coverage is the same for all samples; just broadcast to the sample slots
uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
if(T::MultisampleT::numSamples == 1)
{
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
}
else if(T::MultisampleT::numSamples == 2)
{
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 4)
{
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 8)
{
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
}
else if(T::MultisampleT::numSamples == 16)
{
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
}
}
else
{
simdscalari src = _simd_set1_epi32(0);
simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
if(T::MultisampleT::numSamples == 1)
{
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
}
else if(T::MultisampleT::numSamples == 2)
{
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
}
else if(T::MultisampleT::numSamples == 4)
{
mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
}
else if(T::MultisampleT::numSamples == 8)
{
mask[0] = _simd_set1_epi32(-1);
}
else if(T::MultisampleT::numSamples == 16)
{
mask[0] = _simd_set1_epi32(-1);
mask[1] = _simd_set1_epi32(-1);
index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
}
// gather coverage for samples 0-7
sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
if(T::MultisampleT::numSamples > 8)
{
// gather coverage for samples 8-15
sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
}
}
mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
// pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
simdscalari packedCoverage1;
if(T::MultisampleT::numSamples > 8)
{
// pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
}
#if (KNOB_ARCH == KNOB_ARCH_AVX)
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
// pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
}
else
{
packedSampleCoverage = packedCoverage0;
}
#else
simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
// pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
// blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
}
else
{
packedSampleCoverage = packedCoverage0;
}
#endif
for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
{
// convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
if(!T::bForcedSampleCount)
{
// input coverage has to be anded with sample mask if MSAA isn't forced on
inputMask[i] &= sampleMask;
}
// shift to the next pixel in the 4x2
packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
}
}
INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
}
};
template<typename T>
struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
{
INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
{
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
simdscalari vec = _simd_set1_epi32(coverageMask[0]);
const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = _simd_and_si(vec, bit);
vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
inputCoverage = _simd_castsi_ps(vec);
}
INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
{
uint32_t simdCoverage = (coverageMask[0] & MASK);
static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
{
// set all samples to covered if conservative coverage mask is set for that pixel
inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Centroid behaves exactly as follows :
// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
// have a sample location there).
// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
// coverage with the SampleMask Rasterizer State.
// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename T>
INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
const uint64_t *const coverageMask, const uint32_t sampleMask,
simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL)
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
// Case (2) - partially covered pixel
// scan for first covered sample per pixel in the 4x2 span
unsigned long sampleNum[KNOB_SIMD_WIDTH];
(inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
(inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
(inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
(inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
(inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
(inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
(inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
(inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
// look up and set the sample offsets from UL pixel corner for first covered sample
simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
samplePos.X(sampleNum[6]),
samplePos.X(sampleNum[5]),
samplePos.X(sampleNum[4]),
samplePos.X(sampleNum[3]),
samplePos.X(sampleNum[2]),
samplePos.X(sampleNum[1]),
samplePos.X(sampleNum[0]));
simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
samplePos.Y(sampleNum[6]),
samplePos.Y(sampleNum[5]),
samplePos.Y(sampleNum[4]),
samplePos.Y(sampleNum[3]),
samplePos.Y(sampleNum[2]),
samplePos.Y(sampleNum[1]),
samplePos.Y(sampleNum[0]));
// add sample offset to UL pixel corner
vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
// Case (1) and case (3b) - All samples covered or not covered with full SampleMask
static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
static const simdscalari vZero = _simd_setzero_si();
const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
// set the centroid position based on results from above
psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
// Case (3a) No samples covered and partial sample mask
simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
// sample mask should never be all 0's for this case, but handle it anyways
unsigned long firstCoveredSampleMaskSample = 0;
(sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
// blend in case 3a pixel locations
psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
}
INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL)
{
// evaluate I,J
psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
// interpolate 1/w
psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
}
INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz)
{
const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
}
template<typename T>
INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
{
// RT has to be single sample if we're in forcedMSAA mode
if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
{
return 1;
}
// unless we're forced to single sample, in which case we run the OM at the sample count of the RT
else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
{
return GetNumSamples(blendSampleCount);
}
// else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
else
{
return T::MultisampleT::numSamples;
}
}
inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
{
// broadcast scalars
coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
}
inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers)
{
DWORD index;
while (_BitScanForward(&index, colorHotTileMask))
{
assert(index < SWR_NUM_RENDERTARGETS);
colorHotTileMask &= ~(1 << index);
pColorBuffer[index] = renderBuffers.pColor[index];
}
if (pDepthBuffer)
{
*pDepthBuffer = renderBuffers.pDepth;
}
if (pStencilBuffer)
{
*pStencilBuffer = renderBuffers.pStencil;;
}
}
template<typename T>
void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
{
psContext->pAttribs = work.pAttribs;
psContext->pPerspAttribs = work.pPerspAttribs;
psContext->frontFace = work.triFlags.frontFacing;
psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
// save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
psContext->I = work.I;
psContext->J = work.J;
psContext->recipDet = work.recipDet;
psContext->pRecipW = work.pRecipW;
psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
psContext->sampleIndex = 0;
}
template<typename T, bool IsSingleSample>
void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
{
if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
{
// for 1x case, centroid is pixel center
psContext->vX.centroid = psContext->vX.center;
psContext->vY.centroid = psContext->vY.center;
psContext->vI.centroid = psContext->vI.center;
psContext->vJ.centroid = psContext->vJ.center;
psContext->vOneOverW.centroid = psContext->vOneOverW.center;
}
else
{
if (T::bCentroidPos)
{
///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
if (T::bIsCenterPattern)
{
psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
}
else
{
// add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
}
CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
}
else
{
psContext->vX.centroid = psContext->vX.sample;
psContext->vY.centroid = psContext->vY.sample;
}
}
}
template<typename T>
struct PixelRateZTestLoop
{
PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
samplePos(state.rastState.samplePositions),
clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
INLINE
uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
{
SWR_CONTEXT *pContext = pDC->pContext;
uint32_t statCount = 0;
simdscalar anyDepthSamplePassed = _simd_setzero_ps();
for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
{
const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
if(!_simd_movemask_ps(vCoverageMask[sample]))
{
vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
continue;
}
// offset depth/stencil buffers current sample
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
}
AR_BEGIN(BEBarycentric, pDC->drawId);
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
// calc I & J per sample
CalcSampleBarycentrics(coeffs, psContext);
if(psState.writesODepth)
{
{
// broadcast and test oDepth(psContext.vZ) written from the PS for each sample
vZ[sample] = psContext.vZ;
}
}
else
{
vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
}
AR_END(BEBarycentric, 0);
///@todo: perspective correct vs non-perspective correct clipping?
// if clip distances are enabled, we need to interpolate for each sample
if(clipDistanceMask)
{
uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
}
// ZTest for this sample
///@todo Need to uncomment out this bucket.
//AR_BEGIN(BEDepthBucket, pDC->drawId);
depthPassMask[sample] = vCoverageMask[sample];
stencilPassMask[sample] = vCoverageMask[sample];
depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
vZ[sample], pDepthSample, vCoverageMask[sample],
pStencilSample, &stencilPassMask[sample]);
//AR_END(BEDepthBucket, 0);
// early-exit if no pixels passed depth or earlyZ is forced on
if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
if(!_simd_movemask_ps(depthPassMask[sample]))
{
continue;
}
}
anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
statCount += _mm_popcnt_u32(statMask);
}
activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
// return number of samples that passed depth and coverage
return statCount;
}
// saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
simdscalar vZ[T::MultisampleT::numCoverageSamples];
simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
private:
// functor inputs
DRAW_CONTEXT* pDC;
uint32_t workerId;
const SWR_TRIANGLE_DESC& work;
const BarycentricCoeffs& coeffs;
const API_STATE& state;
const SWR_PS_STATE& psState;
const SWR_MULTISAMPLE_POS& samplePos;
const uint8_t clipDistanceMask;
uint8_t*& pDepthBuffer;
uint8_t*& pStencilBuffer;
};
INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
{
// evaluate I,J
psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
// interpolate 1/w
psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
}
static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
{
// evaluate I,J
psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
// interpolate 1/w
psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
}
// Merge Output to 4x2 SIMD Tile Format
INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask)
{
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
simdvector blendOut;
DWORD rt = 0;
while (_BitScanForward(&rt, renderTargetMask))
{
renderTargetMask &= ~(1 << rt);
uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
{
// pfnBlendFunc may not update all channels. Initialize with PS output.
/// TODO: move this into the blend JIT.
blendOut = psContext.shaded[rt];
// Blend outputs and update coverage mask for alpha test
if(pfnBlendFunc[rt] != nullptr)
{
pfnBlendFunc[rt](
pBlendState,
psContext.shaded[rt],
psContext.shaded[1],
psContext.shaded[0].w,
sample,
pColorSample,
blendOut,
&psContext.oMask,
(simdscalari*)&coverageMask);
}
}
// final write mask
simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
// store with color mask
if(!pRTBlend->writeDisableRed)
{
_simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
}
if(!pRTBlend->writeDisableGreen)
{
_simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
}
if(!pRTBlend->writeDisableBlue)
{
_simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
}
if(!pRTBlend->writeDisableAlpha)
{
_simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
}
}
}
#if USE_8x2_TILE_BACKEND
// Merge Output to 8x2 SIMD16 Tile Format
INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
{
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
if (useAlternateOffset)
{
rasterTileColorOffset += sizeof(simdscalar);
}
simdvector blendSrc;
simdvector blendOut;
DWORD rt;
while (_BitScanForward(&rt, renderTargetMask))
{
renderTargetMask &= ~(1 << rt);
const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
simdscalar* pColorSample;
bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
if (hotTileEnable)
{
pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
blendSrc[0] = pColorSample[0];
blendSrc[1] = pColorSample[2];
blendSrc[2] = pColorSample[4];
blendSrc[3] = pColorSample[6];
}
else
{
pColorSample = nullptr;
}
{
// pfnBlendFunc may not update all channels. Initialize with PS output.
/// TODO: move this into the blend JIT.
blendOut = psContext.shaded[rt];
// Blend outputs and update coverage mask for alpha test
if(pfnBlendFunc[rt] != nullptr)
{
pfnBlendFunc[rt](
pBlendState,
psContext.shaded[rt],
psContext.shaded[1],
psContext.shaded[0].w,
sample,
reinterpret_cast<uint8_t *>(&blendSrc),
blendOut,
&psContext.oMask,
reinterpret_cast<simdscalari *>(&coverageMask));
}
}
// final write mask
simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
// store with color mask
if (!pRTBlend->writeDisableRed)
{
_simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
}
if (!pRTBlend->writeDisableGreen)
{
_simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
}
if (!pRTBlend->writeDisableBlue)
{
_simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
}
if (!pRTBlend->writeDisableAlpha)
{
_simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
}
}
}
#endif
template<typename T>
void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(BEPixelRateBackend, pDC->drawId);
AR_BEGIN(BESetup, pDC->drawId);
const API_STATE &state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
uint8_t *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
AR_END(BESetup, 0);
PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
simdscalar activeLanes;
if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
AR_BEGIN(BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
AR_END(BEBarycentric, 0);
if(T::bForcedSampleCount)
{
// candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
activeLanes = _simd_and_ps(activeLanes, vSampleMask);
}
// Early-Z?
if(T::bCanEarlyZ && !T::bForcedSampleCount)
{
uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
UPDATE_STAT_BE(DepthPassCount, depthPassCount);
AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
}
// if we have no covered samples that passed depth at this point, go to next tile
if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
if(state.psState.usesSourceDepth)
{
AR_BEGIN(BEBarycentric, pDC->drawId);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
AR_END(BEBarycentric, 0);
}
// pixels that are currently active
psContext.activeMask = _simd_castps_si(activeLanes);
psContext.oMask = T::MultisampleT::FullSampleMask();
// execute pixel shader
AR_BEGIN(BEPixelShader, pDC->drawId);
state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
AR_END(BEPixelShader, 0);
// update active lanes to remove any discarded or oMask'd pixels
activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
// late-Z
if(!T::bCanEarlyZ && !T::bForcedSampleCount)
{
uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
UPDATE_STAT_BE(DepthPassCount, depthPassCount);
AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
}
// if we have no covered samples that passed depth at this point, skip OM and go to next tile
if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
// output merger
// loop over all samples, broadcasting the results of the PS to all passing pixels
for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
{
AR_BEGIN(BEOutputMerger, pDC->drawId);
// center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
simdscalar coverageMask, depthMask;
if(T::bForcedSampleCount)
{
coverageMask = depthMask = activeLanes;
}
else
{
coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
if(!_simd_movemask_ps(depthMask))
{
// stencil should already have been written in early/lateZ tests
AR_END(BEOutputMerger, 0);
continue;
}
}
// broadcast the results of the PS to all passing pixels
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset);
#else // USE_8x2_TILE_BACKEND
OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask);
#endif // USE_8x2_TILE_BACKEND
if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
{
uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
}
AR_END(BEOutputMerger, 0);
}
Endtile:
AR_BEGIN(BEEndTile, pDC->drawId);
for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
{
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
DWORD rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
DWORD rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
AR_END(BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
AR_END(BEPixelRateBackend, 0);
}
template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
>
struct SwrBackendTraits
{
static const bool bIsCenterPattern = (isCenter == 1);
static const uint32_t InputCoverage = coverage;
static const bool bCentroidPos = (centroid == 1);
static const bool bForcedSampleCount = (forced == 1);
static const bool bCanEarlyZ = (canEarlyZ == 1);
typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
};