| /**************************************************************************** |
| * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * @file rasterizer.cpp |
| * |
| * @brief Implementation for the rasterizer. |
| * |
| ******************************************************************************/ |
| |
| #include <vector> |
| #include <algorithm> |
| |
| #include "rasterizer.h" |
| #include "rdtsc_core.h" |
| #include "backend.h" |
| #include "utils.h" |
| #include "frontend.h" |
| #include "tilemgr.h" |
| #include "memory/tilingtraits.h" |
| |
| extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT] |
| [STATE_VALID_TRI_EDGE_COUNT][2]; |
| |
| template <uint32_t numSamples = 1> |
| void GetRenderHotTiles(DRAW_CONTEXT* pDC, |
| uint32_t workerId, |
| uint32_t macroID, |
| uint32_t x, |
| uint32_t y, |
| RenderOutputBuffers& renderBuffers, |
| uint32_t renderTargetArrayIndex); |
| template <typename RT> |
| void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers); |
| template <typename RT> |
| void StepRasterTileY(uint32_t colorHotTileMask, |
| RenderOutputBuffers& buffers, |
| RenderOutputBuffers& startBufferRow); |
| |
| #define MASKTOVEC(i3, i2, i1, i0) \ |
| { \ |
| -i0, -i1, -i2, -i3 \ |
| } |
| static const __m256d gMaskToVecpd[] = { |
| MASKTOVEC(0, 0, 0, 0), |
| MASKTOVEC(0, 0, 0, 1), |
| MASKTOVEC(0, 0, 1, 0), |
| MASKTOVEC(0, 0, 1, 1), |
| MASKTOVEC(0, 1, 0, 0), |
| MASKTOVEC(0, 1, 0, 1), |
| MASKTOVEC(0, 1, 1, 0), |
| MASKTOVEC(0, 1, 1, 1), |
| MASKTOVEC(1, 0, 0, 0), |
| MASKTOVEC(1, 0, 0, 1), |
| MASKTOVEC(1, 0, 1, 0), |
| MASKTOVEC(1, 0, 1, 1), |
| MASKTOVEC(1, 1, 0, 0), |
| MASKTOVEC(1, 1, 0, 1), |
| MASKTOVEC(1, 1, 1, 0), |
| MASKTOVEC(1, 1, 1, 1), |
| }; |
| |
| struct POS |
| { |
| int32_t x, y; |
| }; |
| |
| struct EDGE |
| { |
| double a, b; // a, b edge coefficients in fix8 |
| double stepQuadX; // step to adjacent horizontal quad in fix16 |
| double stepQuadY; // step to adjacent vertical quad in fix16 |
| double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 |
| double stepRasterTileY; // step to adjacent vertical raster tile in fix16 |
| |
| __m256d vQuadOffsets; // offsets for 4 samples of a quad |
| __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief rasterize a raster tile partially covered by the triangle |
| /// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster |
| /// tile |
| /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) |
| /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. |
| /// Used to step between quads when sweeping over the raster tile. |
| template <uint32_t NumEdges, typename EdgeMaskT> |
| INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC, |
| double startEdges[NumEdges], |
| EDGE* pRastEdges) |
| { |
| uint64_t coverageMask = 0; |
| |
| __m256d vEdges[NumEdges]; |
| __m256d vStepX[NumEdges]; |
| __m256d vStepY[NumEdges]; |
| |
| for (uint32_t e = 0; e < NumEdges; ++e) |
| { |
| // Step to the pixel sample locations of the 1st quad |
| vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets); |
| |
| // compute step to next quad (mul by 2 in x and y direction) |
| vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX); |
| vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY); |
| } |
| |
| // fast unrolled version for 8x8 tile |
| #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 |
| int edgeMask[NumEdges]; |
| uint64_t mask; |
| |
| auto eval_lambda = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); }; |
| auto update_lambda = [&](int e) { mask &= edgeMask[e]; }; |
| auto incx_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); }; |
| auto incy_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); }; |
| auto decx_lambda = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); }; |
| |
| // evaluate which pixels in the quad are covered |
| #define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda); |
| |
| // update coverage mask |
| // if edge 0 is degenerate and will be skipped; init the mask |
| #define UPDATE_MASK(bit) \ |
| if (std::is_same<EdgeMaskT, E1E2ValidT>::value || \ |
| std::is_same<EdgeMaskT, NoEdgesValidT>::value) \ |
| { \ |
| mask = 0xf; \ |
| } \ |
| else \ |
| { \ |
| mask = edgeMask[0]; \ |
| } \ |
| UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \ |
| coverageMask |= (mask << bit); |
| |
| // step in the +x direction to the next quad |
| #define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda); |
| |
| // step in the +y direction to the next quad |
| #define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda); |
| |
| // step in the -x direction to the next quad |
| #define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda); |
| |
| // sweep 2x2 quad back and forth through the raster tile, |
| // computing coverage masks for the entire tile |
| |
| // raster tile |
| // 0 1 2 3 4 5 6 7 |
| // x x |
| // x x ------------------> |
| // x x | |
| // <-----------------x x V |
| // .. |
| |
| // row 0 |
| EVAL; |
| UPDATE_MASK(0); |
| INCX; |
| EVAL; |
| UPDATE_MASK(4); |
| INCX; |
| EVAL; |
| UPDATE_MASK(8); |
| INCX; |
| EVAL; |
| UPDATE_MASK(12); |
| INCY; |
| |
| // row 1 |
| EVAL; |
| UPDATE_MASK(28); |
| DECX; |
| EVAL; |
| UPDATE_MASK(24); |
| DECX; |
| EVAL; |
| UPDATE_MASK(20); |
| DECX; |
| EVAL; |
| UPDATE_MASK(16); |
| INCY; |
| |
| // row 2 |
| EVAL; |
| UPDATE_MASK(32); |
| INCX; |
| EVAL; |
| UPDATE_MASK(36); |
| INCX; |
| EVAL; |
| UPDATE_MASK(40); |
| INCX; |
| EVAL; |
| UPDATE_MASK(44); |
| INCY; |
| |
| // row 3 |
| EVAL; |
| UPDATE_MASK(60); |
| DECX; |
| EVAL; |
| UPDATE_MASK(56); |
| DECX; |
| EVAL; |
| UPDATE_MASK(52); |
| DECX; |
| EVAL; |
| UPDATE_MASK(48); |
| #else |
| uint32_t bit = 0; |
| for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y) |
| { |
| __m256d vStartOfRowEdge[NumEdges]; |
| for (uint32_t e = 0; e < NumEdges; ++e) |
| { |
| vStartOfRowEdge[e] = vEdges[e]; |
| } |
| |
| for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x) |
| { |
| int edgeMask[NumEdges]; |
| for (uint32_t e = 0; e < NumEdges; ++e) |
| { |
| edgeMask[e] = _mm256_movemask_pd(vEdges[e]); |
| } |
| |
| uint64_t mask = edgeMask[0]; |
| for (uint32_t e = 1; e < NumEdges; ++e) |
| { |
| mask &= edgeMask[e]; |
| } |
| coverageMask |= (mask << bit); |
| |
| // step to the next pixel in the x |
| for (uint32_t e = 0; e < NumEdges; ++e) |
| { |
| vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); |
| } |
| bit += 4; |
| } |
| |
| // step to the next row |
| for (uint32_t e = 0; e < NumEdges; ++e) |
| { |
| vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]); |
| } |
| } |
| #endif |
| return coverageMask; |
| } |
| // Top left rule: |
| // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge |
| // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it |
| // is a 'left' edge Top left: a sample is in if it is a top or left edge. Out: !(horizontal && |
| // above) = !horizontal && below Out: !horizontal && left = !(!horizontal && left) = horizontal and |
| // right |
| INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge) |
| { |
| // if vA < 0, vC-- |
| // if vA == 0 && vB < 0, vC-- |
| |
| __m256d vEdgeOut = vEdge; |
| __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); |
| |
| // if vA < 0 (line is not horizontal and below) |
| int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); |
| |
| // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) |
| __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); |
| int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); |
| msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); |
| |
| // if either of these are true and we're on the line (edge == 0), bump it outside the line |
| vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief calculates difference in precision between the result of manh |
| /// calculation and the edge precision, based on compile time trait values |
| template <typename RT> |
| constexpr int64_t ManhToEdgePrecisionAdjust() |
| { |
| static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= |
| RT::EdgePrecisionT::BitsT::value, |
| "Inadequate precision of result of manh calculation "); |
| return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - |
| RT::EdgePrecisionT::BitsT::value); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @struct adjustEdgeConservative |
| /// @brief Primary template definition used for partially specializing |
| /// the adjustEdgeConservative function. This struct should never |
| /// be instantiated. |
| /// @tparam RT: rasterizer traits |
| /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting? |
| template <typename RT, typename ConservativeEdgeOffsetT> |
| struct adjustEdgeConservative |
| { |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Performs calculations to adjust each edge of a triangle away |
| /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y |
| /// direction. |
| /// |
| /// Uncertainty regions arise from fixed point rounding, which |
| /// can snap a vertex +/- by min fixed point value. |
| /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. |
| /// This allows the rasterizer to test for coverage only at the pixel center, |
| /// instead of having to test individual pixel corners for conservative coverage |
| INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) |
| { |
| // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge |
| // away from the pixel center (in the direction of the edge normal A/B) |
| |
| // edge = Ax + Bx + C - (manh/e) |
| // manh = manhattan distance = abs(A) + abs(B) |
| // e = absolute rounding error from snapping from float to fixed point precision |
| |
| // 'fixed point' multiply (in double to be avx1 friendly) |
| // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example |
| __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), |
| vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); |
| __m256d manh = |
| _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), |
| _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); |
| |
| static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= |
| RT::EdgePrecisionT::BitsT::value, |
| "Inadequate precision of result of manh calculation "); |
| |
| // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the |
| // same precision since we're doing fixed math in double format, multiply by multiples of |
| // 1/2 instead of a bit shift right |
| manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5)); |
| |
| // move the edge away from the pixel center by the required conservative precision + 1/2 |
| // pixel this allows the rasterizer to do a single conservative coverage test to see if the |
| // primitive intersects the pixel at all |
| vEdge = _mm256_sub_pd(vEdge, manh); |
| }; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief adjustEdgeConservative specialization where no edge offset is needed |
| template <typename RT> |
| struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>> |
| { |
| INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){}; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief calculates the distance a degenerate BBox needs to be adjusted |
| /// for conservative rast based on compile time trait values |
| template <typename RT> |
| constexpr int64_t ConservativeScissorOffset() |
| { |
| static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, |
| "Rasterizer precision > conservative precision"); |
| // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox |
| // when calculating scissor edges |
| typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> |
| DegenerateEdgeOffsetT; |
| // 1/2 pixel edge offset + conservative offset - degenerateTriangle |
| return RT::ConservativeEdgeOffsetT::value - |
| (DegenerateEdgeOffsetT::value |
| << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Performs calculations to adjust each a vector of evaluated edges out |
| /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y |
| /// direction. |
| template <typename RT> |
| INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge) |
| { |
| int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); |
| int64_t manh = |
| ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> |
| ManhToEdgePrecisionAdjust<RT>(); |
| vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Performs calculations to adjust each a scalar evaluated edge out |
| /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y |
| /// direction. |
| template <typename RT, typename OffsetT> |
| INLINE double adjustScalarEdge(const double a, const double b, const double Edge) |
| { |
| int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); |
| int64_t manh = |
| ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>(); |
| return (Edge - manh); |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Perform any needed adjustments to evaluated triangle edges |
| template <typename RT, typename EdgeOffsetT> |
| struct adjustEdgesFix16 |
| { |
| INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) |
| { |
| static_assert( |
| std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, |
| "Edge equation expected to be in x.16 fixed point"); |
| |
| static_assert(RT::IsConservativeT::value, |
| "Edge offset assumes conservative rasterization is enabled"); |
| |
| // need to apply any edge offsets before applying the top-left rule |
| adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge); |
| |
| adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); |
| } |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Perform top left adjustments to evaluated triangle edges |
| template <typename RT> |
| struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>> |
| { |
| INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge) |
| { |
| adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); |
| } |
| }; |
| |
| // max(abs(dz/dx), abs(dz,dy) |
| INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) |
| { |
| /* |
| // evaluate i,j at (0,0) |
| float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; |
| float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; |
| |
| // evaluate i,j at (1,0) |
| float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; |
| float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; |
| |
| // compute dz/dx |
| float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; |
| float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; |
| float dzdx = abs(d10 - d00); |
| |
| // evaluate i,j at (0,1) |
| float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; |
| float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; |
| |
| float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; |
| float dzdy = abs(d01 - d00); |
| */ |
| |
| // optimized version of above |
| float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); |
| float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); |
| |
| return std::max(dzdx, dzdy); |
| } |
| |
| INLINE float |
| ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) |
| { |
| if (pState->depthFormat == R24_UNORM_X8_TYPELESS) |
| { |
| return (1.0f / (1 << 24)); |
| } |
| else if (pState->depthFormat == R16_UNORM) |
| { |
| return (1.0f / (1 << 16)); |
| } |
| else |
| { |
| SWR_ASSERT(pState->depthFormat == R32_FLOAT); |
| |
| // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) |
| float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); |
| uint32_t zMaxInt = *(uint32_t*)&zMax; |
| zMaxInt &= 0x7f800000; |
| zMax = *(float*)&zMaxInt; |
| |
| return zMax * (1.0f / (1 << 23)); |
| } |
| } |
| |
| INLINE float |
| ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) |
| { |
| if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) |
| { |
| return 0.0f; |
| } |
| |
| float scale = pState->slopeScaledDepthBias; |
| if (scale != 0.0f) |
| { |
| scale *= ComputeMaxDepthSlope(pTri); |
| } |
| |
| float bias = pState->depthBias; |
| if (!pState->depthBiasPreAdjusted) |
| { |
| bias *= ComputeBiasFactor(pState, pTri, z); |
| } |
| bias += scale; |
| |
| if (pState->depthBiasClamp > 0.0f) |
| { |
| bias = std::min(bias, pState->depthBiasClamp); |
| } |
| else if (pState->depthBiasClamp < 0.0f) |
| { |
| bias = std::max(bias, pState->depthBiasClamp); |
| } |
| |
| return bias; |
| } |
| |
| // Prevent DCE by writing coverage mask from rasterizer to volatile |
| #if KNOB_ENABLE_TOSS_POINTS |
| __declspec(thread) volatile uint64_t gToss; |
| #endif |
| |
| static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; |
| // try to avoid _chkstk insertions; make this thread local |
| static THREAD |
| OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib]; |
| |
| INLINE |
| void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) |
| { |
| edge.a = a; |
| edge.b = b; |
| |
| // compute constant steps to adjacent quads |
| edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); |
| edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); |
| |
| // compute constant steps to adjacent raster tiles |
| edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); |
| edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); |
| |
| // compute quad offsets |
| const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); |
| const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); |
| |
| __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); |
| __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); |
| edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); |
| |
| // compute raster tile offsets |
| const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd( |
| (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0); |
| const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd( |
| (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0); |
| |
| __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); |
| __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); |
| edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); |
| } |
| |
| INLINE |
| void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) |
| { |
| ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Primary template definition used for partially specializing |
| /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel |
| /// corner to sample position, and test for coverage |
| /// @tparam sampleCount: multisample count |
| template <typename NumSamplesT> |
| INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], |
| const __m256d* vEdgeFix16, |
| int32_t& mask0, |
| int32_t& mask1, |
| int32_t& mask2) |
| { |
| __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; |
| // evaluate edge equations at the tile multisample bounding box |
| vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); |
| vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); |
| vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); |
| mask0 = _mm256_movemask_pd(vSampleBboxTest0); |
| mask1 = _mm256_movemask_pd(vSampleBboxTest1); |
| mask2 = _mm256_movemask_pd(vSampleBboxTest2); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated |
| /// when only rasterizing a single coverage test point |
| template <> |
| INLINE void UpdateEdgeMasks<SingleSampleT>( |
| const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2) |
| { |
| mask0 = _mm256_movemask_pd(vEdgeFix16[0]); |
| mask1 = _mm256_movemask_pd(vEdgeFix16[1]); |
| mask2 = _mm256_movemask_pd(vEdgeFix16[2]); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @struct ComputeScissorEdges |
| /// @brief Primary template definition. Allows the function to be generically |
| /// called. When paired with below specializations, will result in an empty |
| /// inlined function if scissor is not enabled |
| /// @tparam RasterScissorEdgesT: is scissor enabled? |
| /// @tparam IsConservativeT: is conservative rast enabled? |
| /// @tparam RT: rasterizer traits |
| template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT> |
| struct ComputeScissorEdges |
| { |
| INLINE ComputeScissorEdges(const SWR_RECT& triBBox, |
| const SWR_RECT& scissorBBox, |
| const int32_t x, |
| const int32_t y, |
| EDGE (&rastEdges)[RT::NumEdgesT::value], |
| __m256d (&vEdgeFix16)[7]){}; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial |
| /// specialization. Instantiated when conservative rast and scissor are enabled |
| template <typename RT> |
| struct ComputeScissorEdges<std::true_type, std::true_type, RT> |
| { |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, |
| /// evaluate edge equations and offset them away from pixel center. |
| INLINE ComputeScissorEdges(const SWR_RECT& triBBox, |
| const SWR_RECT& scissorBBox, |
| const int32_t x, |
| const int32_t y, |
| EDGE (&rastEdges)[RT::NumEdgesT::value], |
| __m256d (&vEdgeFix16)[7]) |
| { |
| // if conservative rasterizing, triangle bbox intersected with scissor bbox is used |
| SWR_RECT scissor; |
| scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin); |
| scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax); |
| scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin); |
| scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax); |
| |
| POS topLeft{scissor.xmin, scissor.ymin}; |
| POS bottomLeft{scissor.xmin, scissor.ymax}; |
| POS topRight{scissor.xmax, scissor.ymin}; |
| POS bottomRight{scissor.xmax, scissor.ymax}; |
| |
| // construct 4 scissor edges in ccw direction |
| ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); |
| ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); |
| ComputeEdgeData(bottomRight, topRight, rastEdges[5]); |
| ComputeEdgeData(topRight, topLeft, rastEdges[6]); |
| |
| vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + |
| (rastEdges[3].b * (y - scissor.ymin))); |
| vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + |
| (rastEdges[4].b * (y - scissor.ymax))); |
| vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + |
| (rastEdges[5].b * (y - scissor.ymax))); |
| vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + |
| (rastEdges[6].b * (y - scissor.ymin))); |
| |
| // if conservative rasterizing, need to bump the scissor edges out by the conservative |
| // uncertainty distance, else do nothing |
| adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); |
| adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); |
| adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]); |
| adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]); |
| |
| // Upper left rule for scissor |
| vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); |
| vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); |
| } |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial |
| /// specialization. Instantiated when scissor is enabled and conservative rast |
| /// is disabled. |
| template <typename RT> |
| struct ComputeScissorEdges<std::true_type, std::false_type, RT> |
| { |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Compute scissor edge vectors and evaluate edge equations |
| INLINE ComputeScissorEdges(const SWR_RECT&, |
| const SWR_RECT& scissorBBox, |
| const int32_t x, |
| const int32_t y, |
| EDGE (&rastEdges)[RT::NumEdgesT::value], |
| __m256d (&vEdgeFix16)[7]) |
| { |
| const SWR_RECT& scissor = scissorBBox; |
| POS topLeft{scissor.xmin, scissor.ymin}; |
| POS bottomLeft{scissor.xmin, scissor.ymax}; |
| POS topRight{scissor.xmax, scissor.ymin}; |
| POS bottomRight{scissor.xmax, scissor.ymax}; |
| |
| // construct 4 scissor edges in ccw direction |
| ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); |
| ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); |
| ComputeEdgeData(bottomRight, topRight, rastEdges[5]); |
| ComputeEdgeData(topRight, topLeft, rastEdges[6]); |
| |
| vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + |
| (rastEdges[3].b * (y - scissor.ymin))); |
| vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + |
| (rastEdges[4].b * (y - scissor.ymax))); |
| vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + |
| (rastEdges[5].b * (y - scissor.ymax))); |
| vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + |
| (rastEdges[6].b * (y - scissor.ymin))); |
| |
| // Upper left rule for scissor |
| vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); |
| vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); |
| } |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Primary function template for TrivialRejectTest. Should |
| /// never be called, but TemplateUnroller instantiates a few unused values, |
| /// so it calls a runtime assert instead of a static_assert. |
| template <typename ValidEdgeMaskT> |
| INLINE bool TrivialRejectTest(const int, const int, const int) |
| { |
| SWR_INVALID("Primary templated function should never be called"); |
| return false; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0 |
| /// and edge 1 for trivial coverage reject |
| template <> |
| INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int) |
| { |
| return (!(mask0 && mask1)) ? true : false; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0 |
| /// and edge 2 for trivial coverage reject |
| template <> |
| INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2) |
| { |
| return (!(mask0 && mask2)) ? true : false; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1 |
| /// and edge 2 for trivial coverage reject |
| template <> |
| INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2) |
| { |
| return (!(mask1 && mask2)) ? true : false; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all |
| /// primitive edges for trivial coverage reject |
| template <> |
| INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2) |
| { |
| return (!(mask0 && mask1 && mask2)) ? true : false; |
| ; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate |
| /// point, so return false and rasterize against conservative BBox |
| template <> |
| INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int) |
| { |
| return false; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Primary function template for TrivialAcceptTest. Always returns |
| /// false, since it will only be called for degenerate tris, and as such |
| /// will never cover the entire raster tile |
| template <typename ScissorEnableT> |
| INLINE bool TrivialAcceptTest(const int, const int, const int) |
| { |
| return false; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all |
| /// edge masks for a fully covered raster tile |
| template <> |
| INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2) |
| { |
| return ((mask0 & mask1 & mask2) == 0xf); |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Primary function template for GenerateSVInnerCoverage. Results |
| /// in an empty function call if SVInnerCoverage isn't requested |
| template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> |
| struct GenerateSVInnerCoverage |
| { |
| INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){}; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Specialization of GenerateSVInnerCoverage where all edges |
| /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated |
| /// edge values from OuterConservative to InnerConservative and rasterizes. |
| template <typename RT> |
| struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT> |
| { |
| INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, |
| uint32_t workerId, |
| EDGE* pRastEdges, |
| double* pStartQuadEdges, |
| uint64_t& innerCoverageMask) |
| { |
| double startQuadEdgesAdj[RT::NumEdgesT::value]; |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>( |
| pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); |
| } |
| |
| // not trivial accept or reject, must rasterize full tile |
| RDTSC_BEGIN(BERasterizePartial, pDC->drawId); |
| innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>( |
| pDC, startQuadEdgesAdj, pRastEdges); |
| RDTSC_END(BERasterizePartial, 0); |
| } |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results |
| /// in an empty function call if SVInnerCoverage isn't requested |
| template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> |
| struct UpdateEdgeMasksInnerConservative |
| { |
| INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], |
| const __m256d*, |
| const __m128i, |
| const __m128i, |
| int32_t&, |
| int32_t&, |
| int32_t&){}; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges |
| /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges |
| /// evaluated at raster tile corners to inner conservative position and |
| /// updates edge masks |
| template <typename RT> |
| struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT> |
| { |
| INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], |
| const __m256d* vEdgeFix16, |
| const __m128i vAi, |
| const __m128i vBi, |
| int32_t& mask0, |
| int32_t& mask1, |
| int32_t& mask2) |
| { |
| __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; |
| |
| // instead of keeping 2 copies of evaluated edges around, just compensate for the outer |
| // conservative evaluated edge when adjusting the edge in for inner conservative tests |
| adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( |
| vAi, vBi, vTempEdge[0]); |
| adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( |
| vAi, vBi, vTempEdge[1]); |
| adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>( |
| vAi, vBi, vTempEdge[2]); |
| |
| UpdateEdgeMasks<typename RT::NumCoverageSamplesT>( |
| vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); |
| } |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage |
| /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot |
| /// cover an entire raster tile, set mask0 to 0 to force it down the |
| /// rastierizePartialTile path |
| template <typename RT, typename ValidEdgeMaskT> |
| struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT> |
| { |
| INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], |
| const __m256d*, |
| const __m128i, |
| const __m128i, |
| int32_t& mask0, |
| int32_t&, |
| int32_t&) |
| { |
| // set one mask to zero to force the triangle down the rastierizePartialTile path |
| mask0 = 0; |
| } |
| }; |
| |
| template <typename RT> |
| void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) |
| { |
| const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc); |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (KNOB_TOSS_BIN_TRIS) |
| { |
| return; |
| } |
| #endif |
| RDTSC_BEGIN(BERasterizeTriangle, pDC->drawId); |
| RDTSC_BEGIN(BETriangleSetup, pDC->drawId); |
| |
| const API_STATE& state = GetApiState(pDC); |
| const SWR_RASTSTATE& rastState = state.rastState; |
| const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; |
| |
| OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; |
| triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; |
| |
| __m128 vX, vY, vZ, vRecipW; |
| |
| // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care |
| // eg: vX = [x0 x1 x2 dc] |
| vX = _mm_load_ps(workDesc.pTriBuffer); |
| vY = _mm_load_ps(workDesc.pTriBuffer + 4); |
| vZ = _mm_load_ps(workDesc.pTriBuffer + 8); |
| vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); |
| |
| // convert to fixed point |
| static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, |
| "Rasterizer expects 16.8 fixed point precision"); |
| __m128i vXi = fpToFixedPoint(vX); |
| __m128i vYi = fpToFixedPoint(vY); |
| |
| // quantize floating point position to fixed point precision |
| // to prevent attribute creep around the triangle vertices |
| vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); |
| vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); |
| |
| // triangle setup - A and B edge equation coefs |
| __m128 vA, vB; |
| triangleSetupAB(vX, vY, vA, vB); |
| |
| __m128i vAi, vBi; |
| triangleSetupABInt(vXi, vYi, vAi, vBi); |
| |
| // determinant |
| float det = calcDeterminantInt(vAi, vBi); |
| |
| // Verts in Pixel Coordinate Space at this point |
| // Det > 0 = CW winding order |
| // Convert CW triangles to CCW |
| if (det > 0.0) |
| { |
| vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); |
| vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); |
| vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); |
| vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); |
| det = -det; |
| } |
| |
| __m128 vC; |
| // Finish triangle setup - C edge coef |
| triangleSetupC(vX, vY, vA, vB, vC); |
| |
| if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) |
| { |
| // If we have degenerate edge(s) to rasterize, set I and J coefs |
| // to 0 for constant interpolation of attributes |
| triDesc.I[0] = 0.0f; |
| triDesc.I[1] = 0.0f; |
| triDesc.I[2] = 0.0f; |
| triDesc.J[0] = 0.0f; |
| triDesc.J[1] = 0.0f; |
| triDesc.J[2] = 0.0f; |
| |
| // Degenerate triangles have no area |
| triDesc.recipDet = 0.0f; |
| } |
| else |
| { |
| // only extract coefs for 2 of the barycentrics; the 3rd can be |
| // determined from the barycentric equation: |
| // i + j + k = 1 <=> k = 1 - j - i |
| _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); |
| _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1); |
| _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1); |
| _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2); |
| _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); |
| _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); |
| |
| // compute recipDet, used to calculate barycentric i and j in the backend |
| triDesc.recipDet = 1.0f / det; |
| } |
| |
| OSALIGNSIMD(float) oneOverW[4]; |
| _mm_store_ps(oneOverW, vRecipW); |
| triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; |
| triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; |
| triDesc.OneOverW[2] = oneOverW[2]; |
| |
| // calculate perspective correct coefs per vertex attrib |
| float* pPerspAttribs = perspAttribsTLS; |
| float* pAttribs = workDesc.pAttribs; |
| triDesc.pPerspAttribs = pPerspAttribs; |
| triDesc.pAttribs = pAttribs; |
| float* pRecipW = workDesc.pTriBuffer + 12; |
| triDesc.pRecipW = pRecipW; |
| __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); |
| __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW += 1); |
| __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW += 1); |
| for (uint32_t i = 0; i < workDesc.numAttribs; i++) |
| { |
| __m128 attribA = _mm_load_ps(pAttribs); |
| __m128 attribB = _mm_load_ps(pAttribs += 4); |
| __m128 attribC = _mm_load_ps(pAttribs += 4); |
| pAttribs += 4; |
| |
| attribA = _mm_mul_ps(attribA, vOneOverWV0); |
| attribB = _mm_mul_ps(attribB, vOneOverWV1); |
| attribC = _mm_mul_ps(attribC, vOneOverWV2); |
| |
| _mm_store_ps(pPerspAttribs, attribA); |
| _mm_store_ps(pPerspAttribs += 4, attribB); |
| _mm_store_ps(pPerspAttribs += 4, attribC); |
| pPerspAttribs += 4; |
| } |
| |
| // compute bary Z |
| // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) |
| OSALIGNSIMD(float) a[4]; |
| _mm_store_ps(a, vZ); |
| triDesc.Z[0] = a[0] - a[2]; |
| triDesc.Z[1] = a[1] - a[2]; |
| triDesc.Z[2] = a[2]; |
| |
| // add depth bias |
| triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); |
| |
| // Calc bounding box of triangle |
| OSALIGNSIMD(SWR_RECT) bbox; |
| calcBoundingBoxInt(vXi, vYi, bbox); |
| |
| const SWR_RECT& scissorInFixedPoint = |
| state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; |
| |
| if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) |
| { |
| // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is |
| // valid |
| bbox.xmin--; |
| bbox.xmax++; |
| bbox.ymin--; |
| bbox.ymax++; |
| SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0, |
| "Conservative rast degenerate handling requires a valid scissor rect"); |
| } |
| |
| // Intersect with scissor/viewport |
| OSALIGNSIMD(SWR_RECT) intersect; |
| intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin); |
| intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax); |
| intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin); |
| intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax); |
| |
| triDesc.triFlags = workDesc.triFlags; |
| |
| // further constrain backend to intersecting bounding box of macro tile and scissored triangle |
| // bbox |
| uint32_t macroX, macroY; |
| MacroTileMgr::getTileIndices(macroTile, macroX, macroY); |
| int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; |
| int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; |
| int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; |
| int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; |
| |
| intersect.xmin = std::max(intersect.xmin, macroBoxLeft); |
| intersect.ymin = std::max(intersect.ymin, macroBoxTop); |
| intersect.xmax = std::min(intersect.xmax, macroBoxRight); |
| intersect.ymax = std::min(intersect.ymax, macroBoxBottom); |
| |
| SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && |
| intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && |
| intersect.ymax >= 0); |
| |
| RDTSC_END(BETriangleSetup, 0); |
| |
| // update triangle desc |
| uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); |
| uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); |
| uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); |
| uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); |
| uint32_t numTilesX = maxTileX - minTileX + 1; |
| uint32_t numTilesY = maxTileY - minTileY + 1; |
| |
| if (numTilesX == 0 || numTilesY == 0) |
| { |
| RDTSC_EVENT(BEEmptyTriangle, 1, 0); |
| RDTSC_END(BERasterizeTriangle, 1); |
| return; |
| } |
| |
| RDTSC_BEGIN(BEStepSetup, pDC->drawId); |
| |
| // Step to pixel center of top-left pixel of the triangle bbox |
| // Align intersect bbox (top/left) to raster tile's (top/left). |
| int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); |
| int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); |
| |
| // convenience typedef |
| typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT; |
| |
| // single sample rasterization evaluates edges at pixel center, |
| // multisample evaluates edges UL pixel corner and steps to each sample position |
| if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value) |
| { |
| // Add 0.5, in fixed point, to offset to pixel center |
| x += (FIXED_POINT_SCALE / 2); |
| y += (FIXED_POINT_SCALE / 2); |
| } |
| |
| __m128i vTopLeftX = _mm_set1_epi32(x); |
| __m128i vTopLeftY = _mm_set1_epi32(y); |
| |
| // evaluate edge equations at top-left pixel using 64bit math |
| // |
| // line = Ax + By + C |
| // solving for C: |
| // C = -Ax - By |
| // we know x0 and y0 are on the line; plug them in: |
| // C = -Ax0 - By0 |
| // plug C back into line equation: |
| // line = Ax - By - Ax0 - By0 |
| // line = A(x - x0) + B(y - y0) |
| // dX = (x-x0), dY = (y-y0) |
| // so all this simplifies to |
| // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within |
| |
| __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); |
| __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); |
| |
| // evaluate A(dx) and B(dY) for all points |
| __m256d vAipd = _mm256_cvtepi32_pd(vAi); |
| __m256d vBipd = _mm256_cvtepi32_pd(vBi); |
| __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); |
| __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); |
| |
| __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); |
| __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); |
| __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); |
| |
| // apply any edge adjustments(top-left, crast, etc) |
| adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge); |
| |
| // broadcast respective edge results to all lanes |
| double* pEdge = (double*)&vEdge; |
| __m256d vEdgeFix16[7]; |
| vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); |
| vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]); |
| vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); |
| |
| OSALIGNSIMD(int32_t) aAi[4], aBi[4]; |
| _mm_store_si128((__m128i*)aAi, vAi); |
| _mm_store_si128((__m128i*)aBi, vBi); |
| EDGE rastEdges[RT::NumEdgesT::value]; |
| |
| // Compute and store triangle edge data |
| ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]); |
| ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]); |
| ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); |
| |
| // Compute and store triangle edge data if scissor needs to rasterized |
| ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>( |
| bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); |
| |
| // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile |
| // used to for testing if entire raster tile is inside a triangle |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets); |
| } |
| |
| // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox |
| // step sample positions to the raster tile bbox of multisample points |
| // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples) |
| // | | |
| // | | |
| // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) |
| __m256d vEdgeTileBbox[3]; |
| if (NumCoverageSamplesT::value > 1) |
| { |
| const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions; |
| const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX(); |
| const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY(); |
| |
| __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); |
| __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); |
| |
| // step edge equation tests from Tile |
| // used to for testing if entire raster tile is inside a triangle |
| for (uint32_t e = 0; e < 3; ++e) |
| { |
| __m256d vResultAxFix16 = |
| _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); |
| __m256d vResultByFix16 = |
| _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); |
| vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); |
| |
| // adjust for msaa tile bbox edges outward for conservative rast, if enabled |
| adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>( |
| vAi, vBi, vEdgeTileBbox[e]); |
| } |
| } |
| |
| RDTSC_END(BEStepSetup, 0); |
| |
| uint32_t tY = minTileY; |
| uint32_t tX = minTileX; |
| uint32_t maxY = maxTileY; |
| uint32_t maxX = maxTileX; |
| |
| RenderOutputBuffers renderBuffers, currentRenderBufferRow; |
| GetRenderHotTiles<RT::MT::numSamples>(pDC, |
| workerId, |
| macroTile, |
| minTileX, |
| minTileY, |
| renderBuffers, |
| triDesc.triFlags.renderTargetArrayIndex); |
| currentRenderBufferRow = renderBuffers; |
| |
| // rasterize and generate coverage masks per sample |
| for (uint32_t tileY = tY; tileY <= maxY; ++tileY) |
| { |
| __m256d vStartOfRowEdge[RT::NumEdgesT::value]; |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| vStartOfRowEdge[e] = vEdgeFix16[e]; |
| } |
| |
| for (uint32_t tileX = tX; tileX <= maxX; ++tileX) |
| { |
| triDesc.anyCoveredSamples = 0; |
| |
| // is the corner of the edge outside of the raster tile? (vEdge < 0) |
| int mask0, mask1, mask2; |
| UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2); |
| |
| for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++) |
| { |
| // trivial reject, at least one edge has all 4 corners of raster tile outside |
| bool trivialReject = |
| TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2); |
| |
| if (!trivialReject) |
| { |
| // trivial accept mask |
| triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; |
| |
| // Update the raster tile edge masks based on inner conservative edge offsets, |
| // if enabled |
| UpdateEdgeMasksInnerConservative<RT, |
| typename RT::ValidEdgeMaskT, |
| typename RT::InputCoverageT>( |
| vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2); |
| |
| // @todo Make this a bit smarter to allow use of trivial accept when: |
| // 1) scissor/vp intersection rect is raster tile aligned |
| // 2) raster tile is entirely within scissor/vp intersection rect |
| if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2)) |
| { |
| // trivial accept, all 4 corners of all 3 edges are negative |
| // i.e. raster tile completely inside triangle |
| triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; |
| if (std::is_same<typename RT::InputCoverageT, |
| InnerConservativeCoverageT>::value) |
| { |
| triDesc.innerCoverageMask = 0xffffffffffffffffULL; |
| } |
| RDTSC_EVENT(BETrivialAccept, 1, 0); |
| } |
| else |
| { |
| __m256d vEdgeAtSample[RT::NumEdgesT::value]; |
| if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value) |
| { |
| // should get optimized out for single sample case (global value |
| // numbering or copy propagation) |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| vEdgeAtSample[e] = vEdgeFix16[e]; |
| } |
| } |
| else |
| { |
| const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions; |
| __m128i vSampleOffsetXh = samplePos.vXi(sampleNum); |
| __m128i vSampleOffsetYh = samplePos.vYi(sampleNum); |
| __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); |
| __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); |
| |
| // step edge equation tests from UL tile corner to pixel sample position |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| __m256d vResultAxFix16 = |
| _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); |
| __m256d vResultByFix16 = |
| _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); |
| vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); |
| vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]); |
| } |
| } |
| |
| double startQuadEdges[RT::NumEdgesT::value]; |
| const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]); |
| } |
| |
| // not trivial accept or reject, must rasterize full tile |
| RDTSC_BEGIN(BERasterizePartial, pDC->drawId); |
| triDesc.coverageMask[sampleNum] = |
| rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>( |
| pDC, startQuadEdges, rastEdges); |
| RDTSC_END(BERasterizePartial, 0); |
| |
| triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; |
| |
| // Output SV InnerCoverage, if needed |
| GenerateSVInnerCoverage<RT, |
| typename RT::ValidEdgeMaskT, |
| typename RT::InputCoverageT>( |
| pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask); |
| } |
| } |
| else |
| { |
| // if we're calculating coverage per sample, need to store it off. otherwise no |
| // covered samples, don't need to do anything |
| if (NumCoverageSamplesT::value > 1) |
| { |
| triDesc.coverageMask[sampleNum] = 0; |
| } |
| RDTSC_EVENT(BETrivialReject, 1, 0); |
| } |
| } |
| |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (KNOB_TOSS_RS) |
| { |
| gToss = triDesc.coverageMask[0]; |
| } |
| else |
| #endif |
| if (triDesc.anyCoveredSamples) |
| { |
| // if conservative rast and MSAA are enabled, conservative coverage for a pixel |
| // means all samples in that pixel are covered copy conservative coverage result to |
| // all samples |
| if (RT::IsConservativeT::value) |
| { |
| auto copyCoverage = [&](int sample) { |
| triDesc.coverageMask[sample] = triDesc.coverageMask[0]; |
| }; |
| UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage); |
| } |
| |
| // Track rasterized subspans |
| AR_EVENT(RasterTileCount(pDC->drawId, 1)); |
| |
| RDTSC_BEGIN(BEPixelBackend, pDC->drawId); |
| backendFuncs.pfnBackend(pDC, |
| workerId, |
| tileX << KNOB_TILE_X_DIM_SHIFT, |
| tileY << KNOB_TILE_Y_DIM_SHIFT, |
| triDesc, |
| renderBuffers); |
| RDTSC_END(BEPixelBackend, 0); |
| } |
| |
| // step to the next tile in X |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| vEdgeFix16[e] = |
| _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); |
| } |
| StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers); |
| } |
| |
| // step to the next tile in Y |
| for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) |
| { |
| vEdgeFix16[e] = |
| _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); |
| } |
| StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow); |
| } |
| |
| RDTSC_END(BERasterizeTriangle, 1); |
| } |
| |
| // Get pointers to hot tile memory for color RT, depth, stencil |
| template <uint32_t numSamples> |
| void GetRenderHotTiles(DRAW_CONTEXT* pDC, |
| uint32_t workerId, |
| uint32_t macroID, |
| uint32_t tileX, |
| uint32_t tileY, |
| RenderOutputBuffers& renderBuffers, |
| uint32_t renderTargetArrayIndex) |
| { |
| const API_STATE& state = GetApiState(pDC); |
| SWR_CONTEXT* pContext = pDC->pContext; |
| HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; |
| |
| uint32_t mx, my; |
| MacroTileMgr::getTileIndices(macroID, mx, my); |
| tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; |
| tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; |
| |
| // compute tile offset for active hottile buffers |
| const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; |
| uint32_t offset = ComputeTileOffset2D< |
| TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>( |
| pitch, tileX, tileY); |
| offset *= numSamples; |
| |
| unsigned long rtSlot = 0; |
| uint32_t colorHottileEnableMask = state.colorHottileEnable; |
| while (_BitScanForward(&rtSlot, colorHottileEnableMask)) |
| { |
| HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile( |
| pContext, |
| pDC, |
| hWorkerPrivateData, |
| macroID, |
| (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), |
| true, |
| numSamples, |
| renderTargetArrayIndex); |
| pColor->state = HOTTILE_DIRTY; |
| renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset; |
| |
| colorHottileEnableMask &= ~(1 << rtSlot); |
| } |
| if (state.depthHottileEnable) |
| { |
| const uint32_t pitch = |
| KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; |
| uint32_t offset = ComputeTileOffset2D< |
| TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>( |
| pitch, tileX, tileY); |
| offset *= numSamples; |
| HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext, |
| pDC, |
| hWorkerPrivateData, |
| macroID, |
| SWR_ATTACHMENT_DEPTH, |
| true, |
| numSamples, |
| renderTargetArrayIndex); |
| pDepth->state = HOTTILE_DIRTY; |
| SWR_ASSERT(pDepth->pBuffer != nullptr); |
| renderBuffers.pDepth = pDepth->pBuffer + offset; |
| } |
| if (state.stencilHottileEnable) |
| { |
| const uint32_t pitch = |
| KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; |
| uint32_t offset = ComputeTileOffset2D< |
| TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>( |
| pitch, tileX, tileY); |
| offset *= numSamples; |
| HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, |
| pDC, |
| hWorkerPrivateData, |
| macroID, |
| SWR_ATTACHMENT_STENCIL, |
| true, |
| numSamples, |
| renderTargetArrayIndex); |
| pStencil->state = HOTTILE_DIRTY; |
| SWR_ASSERT(pStencil->pBuffer != nullptr); |
| renderBuffers.pStencil = pStencil->pBuffer + offset; |
| } |
| } |
| |
| template <typename RT> |
| INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers) |
| { |
| DWORD rt = 0; |
| while (_BitScanForward(&rt, colorHotTileMask)) |
| { |
| colorHotTileMask &= ~(1 << rt); |
| buffers.pColor[rt] += RT::colorRasterTileStep; |
| } |
| |
| buffers.pDepth += RT::depthRasterTileStep; |
| buffers.pStencil += RT::stencilRasterTileStep; |
| } |
| |
| template <typename RT> |
| INLINE void StepRasterTileY(uint32_t colorHotTileMask, |
| RenderOutputBuffers& buffers, |
| RenderOutputBuffers& startBufferRow) |
| { |
| DWORD rt = 0; |
| while (_BitScanForward(&rt, colorHotTileMask)) |
| { |
| colorHotTileMask &= ~(1 << rt); |
| startBufferRow.pColor[rt] += RT::colorRasterTileRowStep; |
| buffers.pColor[rt] = startBufferRow.pColor[rt]; |
| } |
| startBufferRow.pDepth += RT::depthRasterTileRowStep; |
| buffers.pDepth = startBufferRow.pDepth; |
| |
| startBufferRow.pStencil += RT::stencilRasterTileRowStep; |
| buffers.pStencil = startBufferRow.pStencil; |
| } |