/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rasterizer.cpp
*
* @brief Implementation for the rasterizer.
*
******************************************************************************/
#include <vector>
#include <algorithm>
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
[STATE_VALID_TRI_EDGE_COUNT][2];
template <uint32_t numSamples = 1>
void GetRenderHotTiles(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroID,
uint32_t x,
uint32_t y,
RenderOutputBuffers& renderBuffers,
uint32_t renderTargetArrayIndex);
template <typename RT>
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers);
template <typename RT>
void StepRasterTileY(uint32_t colorHotTileMask,
RenderOutputBuffers& buffers,
RenderOutputBuffers& startBufferRow);
#define MASKTOVEC(i3, i2, i1, i0) \
{ \
-i0, -i1, -i2, -i3 \
}
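// Lookup table mapping a 4-bit movemask to a blend mask: each set bit becomes -1.0
// (sign bit set) in the corresponding lane and each clear bit becomes 0.0, so an entry
// can drive _mm256_blendv_pd, which selects purely on the sign bit.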
static const __m256d gMaskToVecpd[] = {
MASKTOVEC(0, 0, 0, 0),
MASKTOVEC(0, 0, 0, 1),
MASKTOVEC(0, 0, 1, 0),
MASKTOVEC(0, 0, 1, 1),
MASKTOVEC(0, 1, 0, 0),
MASKTOVEC(0, 1, 0, 1),
MASKTOVEC(0, 1, 1, 0),
MASKTOVEC(0, 1, 1, 1),
MASKTOVEC(1, 0, 0, 0),
MASKTOVEC(1, 0, 0, 1),
MASKTOVEC(1, 0, 1, 0),
MASKTOVEC(1, 0, 1, 1),
MASKTOVEC(1, 1, 0, 0),
MASKTOVEC(1, 1, 0, 1),
MASKTOVEC(1, 1, 1, 0),
MASKTOVEC(1, 1, 1, 1),
};
struct POS
{
int32_t x, y;
};
struct EDGE
{
double a, b; // a, b edge coefficients in fix8
double stepQuadX; // step to adjacent horizontal quad in fix16
double stepQuadY; // step to adjacent vertical quad in fix16
double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
double stepRasterTileY; // step to adjacent vertical raster tile in fix16
__m256d vQuadOffsets; // offsets for 4 samples of a quad
__m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
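// note: a and b hold the raw fix8 edge coefficients; the step/offset members above are
// pre-scaled to fix16, since A*dX + B*dY with fix8 deltas already yields fix16 values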
};
//////////////////////////////////////////////////////////////////////////
/// @brief rasterize a raster tile partially covered by the triangle
/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster
/// tile
/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + By + C)
/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
/// Used to step between quads when sweeping over the raster tile.
template <uint32_t NumEdges, typename EdgeMaskT>
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC,
double startEdges[NumEdges],
EDGE* pRastEdges)
{
uint64_t coverageMask = 0;
__m256d vEdges[NumEdges];
__m256d vStepX[NumEdges];
__m256d vStepY[NumEdges];
for (uint32_t e = 0; e < NumEdges; ++e)
{
// Step to the pixel sample locations of the 1st quad
vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
// compute step to next quad (mul by 2 in x and y direction)
vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
}
// fast unrolled version for 8x8 tile
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
int edgeMask[NumEdges];
uint64_t mask;
auto eval_lambda = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); };
auto update_lambda = [&](int e) { mask &= edgeMask[e]; };
auto incx_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); };
auto incy_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); };
auto decx_lambda = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); };
// evaluate which pixels in the quad are covered
#define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
// update coverage mask
// if edge 0 is degenerate and will be skipped, init the mask to full coverage
#define UPDATE_MASK(bit) \
if (std::is_same<EdgeMaskT, E1E2ValidT>::value || \
std::is_same<EdgeMaskT, NoEdgesValidT>::value) \
{ \
mask = 0xf; \
} \
else \
{ \
mask = edgeMask[0]; \
} \
UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
coverageMask |= (mask << bit);
// step in the +x direction to the next quad
#define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
// step in the +y direction to the next quad
#define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
// step in the -x direction to the next quad
#define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
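// Each UPDATE_MASK(bit) packs the 4-bit coverage of one 2x2 quad into the 64-bit coverage
// mask at the given bit offset, so the 8x8 tile is 16 quads = 16 nibbles. The serpentine
// traversal below (rightward on even quad rows, leftward on odd ones) is why the offsets
// in odd rows run 28,24,20,16 and 60,56,52,48.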
// sweep 2x2 quad back and forth through the raster tile,
// computing coverage masks for the entire tile
// raster tile
// 0  1  2  3  4  5  6  7
// x  x
//       x  x ------------------>
//                  x  x  |
// <----------------x  x  V
// ..
// row 0
EVAL;
UPDATE_MASK(0);
INCX;
EVAL;
UPDATE_MASK(4);
INCX;
EVAL;
UPDATE_MASK(8);
INCX;
EVAL;
UPDATE_MASK(12);
INCY;
// row 1
EVAL;
UPDATE_MASK(28);
DECX;
EVAL;
UPDATE_MASK(24);
DECX;
EVAL;
UPDATE_MASK(20);
DECX;
EVAL;
UPDATE_MASK(16);
INCY;
// row 2
EVAL;
UPDATE_MASK(32);
INCX;
EVAL;
UPDATE_MASK(36);
INCX;
EVAL;
UPDATE_MASK(40);
INCX;
EVAL;
UPDATE_MASK(44);
INCY;
// row 3
EVAL;
UPDATE_MASK(60);
DECX;
EVAL;
UPDATE_MASK(56);
DECX;
EVAL;
UPDATE_MASK(52);
DECX;
EVAL;
UPDATE_MASK(48);
#else
uint32_t bit = 0;
for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y)
{
__m256d vStartOfRowEdge[NumEdges];
for (uint32_t e = 0; e < NumEdges; ++e)
{
vStartOfRowEdge[e] = vEdges[e];
}
for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x)
{
int edgeMask[NumEdges];
for (uint32_t e = 0; e < NumEdges; ++e)
{
edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
}
uint64_t mask = edgeMask[0];
for (uint32_t e = 1; e < NumEdges; ++e)
{
mask &= edgeMask[e];
}
coverageMask |= (mask << bit);
// step to the next pixel in the x
for (uint32_t e = 0; e < NumEdges; ++e)
{
vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
}
bit += 4;
}
// step to the next row
for (uint32_t e = 0; e < NumEdges; ++e)
{
vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
}
}
#endif
return coverageMask;
}
// Top left rule:
// Top:  if an edge is horizontal and it is above the other edges in tri pixel space, it is a 'top' edge
// Left: if an edge is not horizontal and it is on the left side of the triangle in pixel space, it is a 'left' edge
// Top-left: a sample lying exactly on a 'top' or 'left' edge is in.
// Out (not a top edge):  !(horizontal && above) = !horizontal || below
// Out (not a left edge): !(!horizontal && left) = horizontal || right
INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge)
{
// if vA < 0, vC--
// if vA == 0 && vB < 0, vC--
__m256d vEdgeOut = vEdge;
__m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
// if vA < 0 (line is not horizontal and below)
int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
// if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
__m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
// if either of these are true and we're on the line (edge == 0), bump it outside the line
vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
}
//////////////////////////////////////////////////////////////////////////
/// @brief calculates difference in precision between the result of manh
/// calculation and the edge precision, based on compile time trait values
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
RT::EdgePrecisionT::BitsT::value,
"Inadequate precision of result of manh calculation ");
return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) -
RT::EdgePrecisionT::BitsT::value);
}
//////////////////////////////////////////////////////////////////////////
/// @struct adjustEdgeConservative
/// @brief Primary template definition used for partially specializing
/// the adjustEdgeConservative function. This struct should never
/// be instantiated.
/// @tparam RT: rasterizer traits
/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
template <typename RT, typename ConservativeEdgeOffsetT>
struct adjustEdgeConservative
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs calculations to adjust each edge of a triangle away
/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
/// direction.
///
/// Uncertainty regions arise from fixed point rounding, which
/// can snap a vertex +/- by min fixed point value.
/// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
/// This allows the rasterizer to test for coverage only at the pixel center,
/// instead of having to test individual pixel corners for conservative coverage
INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
{
// Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge
// away from the pixel center (in the direction of the edge normal A/B)
// edge = Ax + By + C - (manh/e)
// manh = manhattan distance = abs(A) + abs(B)
// e = absolute rounding error from snapping from float to fixed point precision
// 'fixed point' multiply (in double to be avx1 friendly)
// need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
__m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)),
vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
__m256d manh =
_mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
_mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
RT::EdgePrecisionT::BitsT::value,
"Inadequate precision of result of manh calculation ");
// rasterizer incoming edge precision is x.16, so we need to get our edge offset into the
// same precision. Since we're doing fixed math in double format, multiply by multiples of
// 1/2 instead of a bit shift right
manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
// move the edge away from the pixel center by the required conservative precision + 1/2
// pixel. This allows the rasterizer to do a single conservative coverage test to see if
// the primitive intersects the pixel at all
vEdge = _mm256_sub_pd(vEdge, manh);
};
};
//////////////////////////////////////////////////////////////////////////
/// @brief adjustEdgeConservative specialization where no edge offset is needed
template <typename RT>
struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
{
INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){};
};
//////////////////////////////////////////////////////////////////////////
/// @brief calculates the distance a degenerate BBox needs to be adjusted
/// for conservative rast based on compile time trait values
template <typename RT>
constexpr int64_t ConservativeScissorOffset()
{
static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0,
"Rasterizer precision > conservative precision");
// if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox
// when calculating scissor edges
typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1>
DegenerateEdgeOffsetT;
// 1/2 pixel edge offset + conservative offset - degenerateTriangle
return RT::ConservativeEdgeOffsetT::value -
(DegenerateEdgeOffsetT::value
<< (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
}
//////////////////////////////////////////////////////////////////////////
/// @brief Performs calculations to adjust a vector of evaluated edges out
/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
/// direction.
template <typename RT>
INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge)
{
int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
int64_t manh =
((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >>
ManhToEdgePrecisionAdjust<RT>();
vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
};
//////////////////////////////////////////////////////////////////////////
/// @brief Performs calculations to adjust a scalar evaluated edge out
/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
/// direction.
template <typename RT, typename OffsetT>
INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
{
int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
int64_t manh =
((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
return (Edge - manh);
};
//////////////////////////////////////////////////////////////////////////
/// @brief Perform any needed adjustments to evaluated triangle edges
template <typename RT, typename EdgeOffsetT>
struct adjustEdgesFix16
{
INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
{
static_assert(
std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
"Edge equation expected to be in x.16 fixed point");
static_assert(RT::IsConservativeT::value,
"Edge offset assumes conservative rasterization is enabled");
// need to apply any edge offsets before applying the top-left rule
adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Perform top left adjustments to evaluated triangle edges
template <typename RT>
struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
{
INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
{
adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
}
};
// max(abs(dz/dx), abs(dz/dy))
INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
{
/*
// evaluate i,j at (0,0)
float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
// evaluate i,j at (1,0)
float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
// compute dz/dx
float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
float dzdx = abs(d10 - d00);
// evaluate i,j at (0,1)
float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
float dzdy = abs(d01 - d00);
*/
// optimized version of above
float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
return std::max(dzdx, dzdy);
}
INLINE float
ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
{
if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
{
return (1.0f / (1 << 24));
}
else if (pState->depthFormat == R16_UNORM)
{
return (1.0f / (1 << 16));
}
else
{
SWR_ASSERT(pState->depthFormat == R32_FLOAT);
// for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
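// mask off the sign and mantissa bits to leave 2^exponent(zMax), then scale by 2^-23
// (type-punning through a uint32_t to read the float's bit pattern)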
uint32_t zMaxInt = *(uint32_t*)&zMax;
zMaxInt &= 0x7f800000;
zMax = *(float*)&zMaxInt;
return zMax * (1.0f / (1 << 23));
}
}
INLINE float
ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
{
if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
{
return 0.0f;
}
float scale = pState->slopeScaledDepthBias;
if (scale != 0.0f)
{
scale *= ComputeMaxDepthSlope(pTri);
}
float bias = pState->depthBias;
if (!pState->depthBiasPreAdjusted)
{
bias *= ComputeBiasFactor(pState, pTri, z);
}
bias += scale;
if (pState->depthBiasClamp > 0.0f)
{
bias = std::min(bias, pState->depthBiasClamp);
}
else if (pState->depthBiasClamp < 0.0f)
{
bias = std::max(bias, pState->depthBiasClamp);
}
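// e.g. (illustrative values): depthBias = 2, R24_UNORM_X8_TYPELESS depth target,
// slopeScaledDepthBias = 0, depthBiasPreAdjusted = false, depthBiasClamp = 0
// => bias = 2 * (1.0f / (1 << 24))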
return bias;
}
// Prevent DCE by writing coverage mask from rasterizer to volatile
#if KNOB_ENABLE_TOSS_POINTS
__declspec(thread) volatile uint64_t gToss;
#endif
static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
// try to avoid _chkstk insertions; make this thread local
static THREAD
OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
INLINE
void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
{
edge.a = a;
edge.b = b;
// compute constant steps to adjacent quads
edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
// compute constant steps to adjacent raster tiles
edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
// compute quad offsets
const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
__m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
__m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
// compute raster tile offsets
const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd(
(KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0);
const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd(
(KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0);
__m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
__m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
}
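// Construct edge coefficients from two fixed-point endpoints of a CCW edge:
// A = y0 - y1, B = x1 - x0. C is not stored; the edge is evaluated later at a known
// reference point via A*dX + B*dY, which folds C in implicitly.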
INLINE
void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
{
ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Primary template definition used for partially specializing
/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
/// corner to sample position, and test for coverage
/// @tparam sampleCount: multisample count
template <typename NumSamplesT>
INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3],
const __m256d* vEdgeFix16,
int32_t& mask0,
int32_t& mask1,
int32_t& mask2)
{
__m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
// evaluate edge equations at the tile multisample bounding box
vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
mask0 = _mm256_movemask_pd(vSampleBboxTest0);
mask1 = _mm256_movemask_pd(vSampleBboxTest1);
mask2 = _mm256_movemask_pd(vSampleBboxTest2);
}
//////////////////////////////////////////////////////////////////////////
/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
/// when only rasterizing a single coverage test point
template <>
INLINE void UpdateEdgeMasks<SingleSampleT>(
const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2)
{
mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
}
//////////////////////////////////////////////////////////////////////////
/// @struct ComputeScissorEdges
/// @brief Primary template definition. Allows the function to be generically
/// called. When paired with below specializations, will result in an empty
/// inlined function if scissor is not enabled
/// @tparam RasterScissorEdgesT: is scissor enabled?
/// @tparam IsConservativeT: is conservative rast enabled?
/// @tparam RT: rasterizer traits
template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
struct ComputeScissorEdges
{
INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
const SWR_RECT& scissorBBox,
const int32_t x,
const int32_t y,
EDGE (&rastEdges)[RT::NumEdgesT::value],
__m256d (&vEdgeFix16)[7]){};
};
//////////////////////////////////////////////////////////////////////////
/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
/// specialization. Instantiated when conservative rast and scissor are enabled
template <typename RT>
struct ComputeScissorEdges<std::true_type, std::true_type, RT>
{
//////////////////////////////////////////////////////////////////////////
/// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
/// evaluate edge equations and offset them away from pixel center.
INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
const SWR_RECT& scissorBBox,
const int32_t x,
const int32_t y,
EDGE (&rastEdges)[RT::NumEdgesT::value],
__m256d (&vEdgeFix16)[7])
{
// if conservative rasterizing, triangle bbox intersected with scissor bbox is used
SWR_RECT scissor;
scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
POS topLeft{scissor.xmin, scissor.ymin};
POS bottomLeft{scissor.xmin, scissor.ymax};
POS topRight{scissor.xmax, scissor.ymin};
POS bottomRight{scissor.xmax, scissor.ymax};
// construct 4 scissor edges in ccw direction
ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
ComputeEdgeData(topRight, topLeft, rastEdges[6]);
vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
(rastEdges[3].b * (y - scissor.ymin)));
vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
(rastEdges[4].b * (y - scissor.ymax)));
vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
(rastEdges[5].b * (y - scissor.ymax)));
vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
(rastEdges[6].b * (y - scissor.ymin)));
// if conservative rasterizing, need to bump the scissor edges out by the conservative
// uncertainty distance, else do nothing
adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
// Upper left rule for scissor
vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
/// specialization. Instantiated when scissor is enabled and conservative rast
/// is disabled.
template <typename RT>
struct ComputeScissorEdges<std::true_type, std::false_type, RT>
{
//////////////////////////////////////////////////////////////////////////
/// @brief Compute scissor edge vectors and evaluate edge equations
INLINE ComputeScissorEdges(const SWR_RECT&,
const SWR_RECT& scissorBBox,
const int32_t x,
const int32_t y,
EDGE (&rastEdges)[RT::NumEdgesT::value],
__m256d (&vEdgeFix16)[7])
{
const SWR_RECT& scissor = scissorBBox;
POS topLeft{scissor.xmin, scissor.ymin};
POS bottomLeft{scissor.xmin, scissor.ymax};
POS topRight{scissor.xmax, scissor.ymin};
POS bottomRight{scissor.xmax, scissor.ymax};
// construct 4 scissor edges in ccw direction
ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
ComputeEdgeData(topRight, topLeft, rastEdges[6]);
vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
(rastEdges[3].b * (y - scissor.ymin)));
vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
(rastEdges[4].b * (y - scissor.ymax)));
vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
(rastEdges[5].b * (y - scissor.ymax)));
vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
(rastEdges[6].b * (y - scissor.ymin)));
// Upper left rule for scissor
vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Primary function template for TrivialRejectTest. Should
/// never be called, but TemplateUnroller instantiates a few unused values,
/// so it calls a runtime assert instead of a static_assert.
template <typename ValidEdgeMaskT>
INLINE bool TrivialRejectTest(const int, const int, const int)
{
SWR_INVALID("Primary templated function should never be called");
return false;
};
//////////////////////////////////////////////////////////////////////////
/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
/// and edge 1 for trivial coverage reject
template <>
INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
{
return (!(mask0 && mask1)) ? true : false;
};
//////////////////////////////////////////////////////////////////////////
/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
/// and edge 2 for trivial coverage reject
template <>
INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
{
return (!(mask0 && mask2)) ? true : false;
};
//////////////////////////////////////////////////////////////////////////
/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
/// and edge 2 for trivial coverage reject
template <>
INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
{
return (!(mask1 && mask2)) ? true : false;
};
//////////////////////////////////////////////////////////////////////////
/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
/// primitive edges for trivial coverage reject
template <>
INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
{
return (!(mask0 && mask1 && mask2)) ? true : false;
};
//////////////////////////////////////////////////////////////////////////
/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
/// point, so return false and rasterize against conservative BBox
template <>
INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
{
return false;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Primary function template for TrivialAcceptTest. Always returns
/// false, since it will only be called for degenerate tris, and as such
/// will never cover the entire raster tile
template <typename ScissorEnableT>
INLINE bool TrivialAcceptTest(const int, const int, const int)
{
return false;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Scissor-disabled (std::false_type) specialization of TrivialAcceptTest. Tests all
/// edge masks for a fully covered raster tile
template <>
INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
{
return ((mask0 & mask1 & mask2) == 0xf);
};
//////////////////////////////////////////////////////////////////////////
/// @brief Primary function template for GenerateSVInnerCoverage. Results
/// in an empty function call if SVInnerCoverage isn't requested
template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
struct GenerateSVInnerCoverage
{
INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){};
};
//////////////////////////////////////////////////////////////////////////
/// @brief Specialization of GenerateSVInnerCoverage where all edges
/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
/// edge values from OuterConservative to InnerConservative and rasterizes.
template <typename RT>
struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
{
INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC,
uint32_t workerId,
EDGE* pRastEdges,
double* pStartQuadEdges,
uint64_t& innerCoverageMask)
{
double startQuadEdgesAdj[RT::NumEdgesT::value];
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(
pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
}
// not trivial accept or reject, must rasterize full tile
RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
pDC, startQuadEdgesAdj, pRastEdges);
RDTSC_END(BERasterizePartial, 0);
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
/// in an empty function call if SVInnerCoverage isn't requested
template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
struct UpdateEdgeMasksInnerConservative
{
INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
const __m256d*,
const __m128i,
const __m128i,
int32_t&,
int32_t&,
int32_t&){};
};
//////////////////////////////////////////////////////////////////////////
/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
/// evaluated at raster tile corners to inner conservative position and
/// updates edge masks
template <typename RT>
struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
{
INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
const __m256d* vEdgeFix16,
const __m128i vAi,
const __m128i vBi,
int32_t& mask0,
int32_t& mask1,
int32_t& mask2)
{
__m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
// instead of keeping 2 copies of evaluated edges around, just compensate for the outer
// conservative evaluated edge when adjusting the edge in for inner conservative tests
adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
vAi, vBi, vTempEdge[0]);
adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
vAi, vBi, vTempEdge[1]);
adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
vAi, vBi, vTempEdge[2]);
UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(
vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
/// cover an entire raster tile, set mask0 to 0 to force it down the
/// rasterizePartialTile path
template <typename RT, typename ValidEdgeMaskT>
struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
{
INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3],
const __m256d*,
const __m128i,
const __m128i,
int32_t& mask0,
int32_t&,
int32_t&)
{
// set one mask to zero to force the triangle down the rasterizePartialTile path
mask0 = 0;
}
};
template <typename RT>
void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
{
const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
#if KNOB_ENABLE_TOSS_POINTS
if (KNOB_TOSS_BIN_TRIS)
{
return;
}
#endif
RDTSC_BEGIN(BERasterizeTriangle, pDC->drawId);
RDTSC_BEGIN(BETriangleSetup, pDC->drawId);
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
__m128 vX, vY, vZ, vRecipW;
// pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
// eg: vX = [x0 x1 x2 dc]
vX = _mm_load_ps(workDesc.pTriBuffer);
vY = _mm_load_ps(workDesc.pTriBuffer + 4);
vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
// convert to fixed point
static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value,
"Rasterizer expects 16.8 fixed point precision");
__m128i vXi = fpToFixedPoint(vX);
__m128i vYi = fpToFixedPoint(vY);
// quantize floating point position to fixed point precision
// to prevent attribute creep around the triangle vertices
vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
// triangle setup - A and B edge equation coefs
__m128 vA, vB;
triangleSetupAB(vX, vY, vA, vB);
__m128i vAi, vBi;
triangleSetupABInt(vXi, vYi, vAi, vBi);
// determinant
float det = calcDeterminantInt(vAi, vBi);
// Verts in Pixel Coordinate Space at this point
// Det > 0 = CW winding order
// Convert CW triangles to CCW
if (det > 0.0)
{
vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
det = -det;
}
__m128 vC;
// Finish triangle setup - C edge coef
triangleSetupC(vX, vY, vA, vB, vC);
if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
{
// If we have degenerate edge(s) to rasterize, set I and J coefs
// to 0 for constant interpolation of attributes
triDesc.I[0] = 0.0f;
triDesc.I[1] = 0.0f;
triDesc.I[2] = 0.0f;
triDesc.J[0] = 0.0f;
triDesc.J[1] = 0.0f;
triDesc.J[2] = 0.0f;
// Degenerate triangles have no area
triDesc.recipDet = 0.0f;
}
else
{
// only extract coefs for 2 of the barycentrics; the 3rd can be
// determined from the barycentric equation:
// i + j + k = 1 <=> k = 1 - j - i
_MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
_MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
_MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
_MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
_MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
_MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
// compute recipDet, used to calculate barycentric i and j in the backend
triDesc.recipDet = 1.0f / det;
}
OSALIGNSIMD(float) oneOverW[4];
_mm_store_ps(oneOverW, vRecipW);
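// 1/w is stored as deltas relative to vertex 2 (plus the absolute value for vertex 2),
// mirroring the Z[] layout below, so the backend can interpolate it barycentrically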
triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
triDesc.OneOverW[2] = oneOverW[2];
// calculate perspective correct coefs per vertex attrib
float* pPerspAttribs = perspAttribsTLS;
float* pAttribs = workDesc.pAttribs;
triDesc.pPerspAttribs = pPerspAttribs;
triDesc.pAttribs = pAttribs;
float* pRecipW = workDesc.pTriBuffer + 12;
triDesc.pRecipW = pRecipW;
__m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
__m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW += 1);
__m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW += 1);
for (uint32_t i = 0; i < workDesc.numAttribs; i++)
{
__m128 attribA = _mm_load_ps(pAttribs);
__m128 attribB = _mm_load_ps(pAttribs += 4);
__m128 attribC = _mm_load_ps(pAttribs += 4);
pAttribs += 4;
attribA = _mm_mul_ps(attribA, vOneOverWV0);
attribB = _mm_mul_ps(attribB, vOneOverWV1);
attribC = _mm_mul_ps(attribC, vOneOverWV2);
_mm_store_ps(pPerspAttribs, attribA);
_mm_store_ps(pPerspAttribs += 4, attribB);
_mm_store_ps(pPerspAttribs += 4, attribC);
pPerspAttribs += 4;
}
// compute bary Z
// zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
OSALIGNSIMD(float) a[4];
_mm_store_ps(a, vZ);
triDesc.Z[0] = a[0] - a[2];
triDesc.Z[1] = a[1] - a[2];
triDesc.Z[2] = a[2];
// add depth bias
triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
// Calc bounding box of triangle
OSALIGNSIMD(SWR_RECT) bbox;
calcBoundingBoxInt(vXi, vYi, bbox);
const SWR_RECT& scissorInFixedPoint =
state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
{
// If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is
// valid
bbox.xmin--;
bbox.xmax++;
bbox.ymin--;
bbox.ymax++;
SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
"Conservative rast degenerate handling requires a valid scissor rect");
}
// Intersect with scissor/viewport
OSALIGNSIMD(SWR_RECT) intersect;
intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
triDesc.triFlags = workDesc.triFlags;
// further constrain backend to intersecting bounding box of macro tile and scissored triangle
// bbox
uint32_t macroX, macroY;
MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
intersect.ymin = std::max(intersect.ymin, macroBoxTop);
intersect.xmax = std::min(intersect.xmax, macroBoxRight);
intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 &&
intersect.ymax >= 0);
RDTSC_END(BETriangleSetup, 0);
// update triangle desc
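// convert the fixed-point intersection rect to raster tile indices by shifting off both
// the fixed-point fraction and the tile dimension bits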
uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
uint32_t numTilesX = maxTileX - minTileX + 1;
uint32_t numTilesY = maxTileY - minTileY + 1;
if (numTilesX == 0 || numTilesY == 0)
{
RDTSC_EVENT(BEEmptyTriangle, 1, 0);
RDTSC_END(BERasterizeTriangle, 1);
return;
}
RDTSC_BEGIN(BEStepSetup, pDC->drawId);
// Step to pixel center of top-left pixel of the triangle bbox
// Align intersect bbox (top/left) to raster tile's (top/left).
int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
// convenience typedef
typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
// single sample rasterization evaluates edges at the pixel center,
// multisample evaluates edges at the UL pixel corner and steps to each sample position
if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
{
// Add 0.5, in fixed point, to offset to pixel center
x += (FIXED_POINT_SCALE / 2);
y += (FIXED_POINT_SCALE / 2);
}
__m128i vTopLeftX = _mm_set1_epi32(x);
__m128i vTopLeftY = _mm_set1_epi32(y);
// evaluate edge equations at top-left pixel using 64bit math
//
// line = Ax + By + C
// solving for C:
// C = -Ax - By
// we know x0 and y0 are on the line; plug them in:
// C = -Ax0 - By0
// plug C back into line equation:
// line = Ax + By - Ax0 - By0
// line = A(x - x0) + B(y - y0)
// dX = (x-x0), dY = (y-y0)
// so all this simplifies to
// edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
__m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
__m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
// evaluate A(dx) and B(dY) for all points
__m256d vAipd = _mm256_cvtepi32_pd(vAi);
__m256d vBipd = _mm256_cvtepi32_pd(vBi);
__m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
__m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
__m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
__m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
__m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
// apply any edge adjustments (top-left, conservative rast, etc.)
adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
// broadcast respective edge results to all lanes
double* pEdge = (double*)&vEdge;
__m256d vEdgeFix16[7];
vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
OSALIGNSIMD(int32_t) aAi[4], aBi[4];
_mm_store_si128((__m128i*)aAi, vAi);
_mm_store_si128((__m128i*)aBi, vBi);
EDGE rastEdges[RT::NumEdgesT::value];
// Compute and store triangle edge data
ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
// Compute and store scissor edge data if the scissor edges need to be rasterized
ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>(
bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
// Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
// used for testing if the entire raster tile is inside a triangle
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
}
// at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
// step sample positions to the raster tile bbox of multisample points
// min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
//                             |      |
//                             |      |
// min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
__m256d vEdgeTileBbox[3];
if (NumCoverageSamplesT::value > 1)
{
const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
__m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
__m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
// step edge equation tests from the tile's UL corner to the corners of the multisample bbox;
// used for testing if the entire raster tile is inside a triangle
for (uint32_t e = 0; e < 3; ++e)
{
__m256d vResultAxFix16 =
_mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
__m256d vResultByFix16 =
_mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
// adjust the msaa tile bbox edges outward for conservative rast, if enabled
adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(
vAi, vBi, vEdgeTileBbox[e]);
}
}
RDTSC_END(BEStepSetup, 0);
uint32_t tY = minTileY;
uint32_t tX = minTileX;
uint32_t maxY = maxTileY;
uint32_t maxX = maxTileX;
RenderOutputBuffers renderBuffers, currentRenderBufferRow;
GetRenderHotTiles<RT::MT::numSamples>(pDC,
workerId,
macroTile,
minTileX,
minTileY,
renderBuffers,
triDesc.triFlags.renderTargetArrayIndex);
currentRenderBufferRow = renderBuffers;
// rasterize and generate coverage masks per sample
for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
{
__m256d vStartOfRowEdge[RT::NumEdgesT::value];
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
vStartOfRowEdge[e] = vEdgeFix16[e];
}
for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
{
triDesc.anyCoveredSamples = 0;
// which corners of the raster tile are inside each edge? (vEdge < 0)
int mask0, mask1, mask2;
UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
{
// trivial reject, at least one edge has all 4 corners of raster tile outside
bool trivialReject =
TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
if (!trivialReject)
{
// trivial accept mask
triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
// Update the raster tile edge masks based on inner conservative edge offsets,
// if enabled
UpdateEdgeMasksInnerConservative<RT,
typename RT::ValidEdgeMaskT,
typename RT::InputCoverageT>(
vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
// @todo Make this a bit smarter to allow use of trivial accept when:
// 1) scissor/vp intersection rect is raster tile aligned
// 2) raster tile is entirely within scissor/vp intersection rect
if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
{
// trivial accept, all 4 corners of all 3 edges are negative
// i.e. raster tile completely inside triangle
triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
if (std::is_same<typename RT::InputCoverageT,
InnerConservativeCoverageT>::value)
{
triDesc.innerCoverageMask = 0xffffffffffffffffULL;
}
RDTSC_EVENT(BETrivialAccept, 1, 0);
}
else
{
__m256d vEdgeAtSample[RT::NumEdgesT::value];
if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
{
// should get optimized out for single sample case (global value
// numbering or copy propagation)
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
vEdgeAtSample[e] = vEdgeFix16[e];
}
}
else
{
const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
__m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
__m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
__m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
__m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
// step edge equation tests from UL tile corner to pixel sample position
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
__m256d vResultAxFix16 =
_mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
__m256d vResultByFix16 =
_mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
}
}
double startQuadEdges[RT::NumEdgesT::value];
const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
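// the mask enables only the low 64-bit lane, so only the edge value at the tile's
// UL corner (lane 0) is written to each startQuadEdges[e]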
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
_mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
}
// not trivial accept or reject, must rasterize full tile
RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
triDesc.coverageMask[sampleNum] =
rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
pDC, startQuadEdges, rastEdges);
RDTSC_END(BERasterizePartial, 0);
triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
// Output SV InnerCoverage, if needed
GenerateSVInnerCoverage<RT,
typename RT::ValidEdgeMaskT,
typename RT::InputCoverageT>(
pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
}
}
else
{
// if we're calculating coverage per sample, we need to store it off; otherwise there are
// no covered samples and nothing needs to be done
if (NumCoverageSamplesT::value > 1)
{
triDesc.coverageMask[sampleNum] = 0;
}
RDTSC_EVENT(BETrivialReject, 1, 0);
}
}
#if KNOB_ENABLE_TOSS_POINTS
if (KNOB_TOSS_RS)
{
gToss = triDesc.coverageMask[0];
}
else
#endif
if (triDesc.anyCoveredSamples)
{
// if conservative rast and MSAA are enabled, conservative coverage for a pixel
// means all samples in that pixel are covered; copy the conservative coverage result
// to all samples
if (RT::IsConservativeT::value)
{
auto copyCoverage = [&](int sample) {
triDesc.coverageMask[sample] = triDesc.coverageMask[0];
};
UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
}
// Track rasterized subspans
AR_EVENT(RasterTileCount(pDC->drawId, 1));
RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
backendFuncs.pfnBackend(pDC,
workerId,
tileX << KNOB_TILE_X_DIM_SHIFT,
tileY << KNOB_TILE_Y_DIM_SHIFT,
triDesc,
renderBuffers);
RDTSC_END(BEPixelBackend, 0);
}
// step to the next tile in X
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
vEdgeFix16[e] =
_mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
}
StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
}
// step to the next tile in Y
for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
{
vEdgeFix16[e] =
_mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
}
StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
}
RDTSC_END(BERasterizeTriangle, 1);
}
// Get pointers to hot tile memory for color RT, depth, stencil
template <uint32_t numSamples>
void GetRenderHotTiles(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroID,
uint32_t tileX,
uint32_t tileY,
RenderOutputBuffers& renderBuffers,
uint32_t renderTargetArrayIndex)
{
const API_STATE& state = GetApiState(pDC);
SWR_CONTEXT* pContext = pDC->pContext;
HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
uint32_t mx, my;
MacroTileMgr::getTileIndices(macroID, mx, my);
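// make the raster tile coordinates relative to this macrotile's origin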
tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
// compute tile offset for active hottile buffers
const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
uint32_t offset = ComputeTileOffset2D<
TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>(
pitch, tileX, tileY);
offset *= numSamples;
unsigned long rtSlot = 0;
uint32_t colorHottileEnableMask = state.colorHottileEnable;
while (_BitScanForward(&rtSlot, colorHottileEnableMask))
{
HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile(
pContext,
pDC,
hWorkerPrivateData,
macroID,
(SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
true,
numSamples,
renderTargetArrayIndex);
pColor->state = HOTTILE_DIRTY;
renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
colorHottileEnableMask &= ~(1 << rtSlot);
}
if (state.depthHottileEnable)
{
const uint32_t pitch =
KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
uint32_t offset = ComputeTileOffset2D<
TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>(
pitch, tileX, tileY);
offset *= numSamples;
HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroID,
SWR_ATTACHMENT_DEPTH,
true,
numSamples,
renderTargetArrayIndex);
pDepth->state = HOTTILE_DIRTY;
SWR_ASSERT(pDepth->pBuffer != nullptr);
renderBuffers.pDepth = pDepth->pBuffer + offset;
}
if (state.stencilHottileEnable)
{
const uint32_t pitch =
KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
uint32_t offset = ComputeTileOffset2D<
TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>(
pitch, tileX, tileY);
offset *= numSamples;
HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroID,
SWR_ATTACHMENT_STENCIL,
true,
numSamples,
renderTargetArrayIndex);
pStencil->state = HOTTILE_DIRTY;
SWR_ASSERT(pStencil->pBuffer != nullptr);
renderBuffers.pStencil = pStencil->pBuffer + offset;
}
}
template <typename RT>
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
{
DWORD rt = 0;
while (_BitScanForward(&rt, colorHotTileMask))
{
colorHotTileMask &= ~(1 << rt);
buffers.pColor[rt] += RT::colorRasterTileStep;
}
buffers.pDepth += RT::depthRasterTileStep;
buffers.pStencil += RT::stencilRasterTileStep;
}
template <typename RT>
INLINE void StepRasterTileY(uint32_t colorHotTileMask,
RenderOutputBuffers& buffers,
RenderOutputBuffers& startBufferRow)
{
DWORD rt = 0;
while (_BitScanForward(&rt, colorHotTileMask))
{
colorHotTileMask &= ~(1 << rt);
startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
buffers.pColor[rt] = startBufferRow.pColor[rt];
}
startBufferRow.pDepth += RT::depthRasterTileRowStep;
buffers.pDepth = startBufferRow.pDepth;
startBufferRow.pStencil += RT::stencilRasterTileRowStep;
buffers.pStencil = startBufferRow.pStencil;
}