| /**************************************************************************** |
| * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * @file clip.h |
| * |
| * @brief Definitions for clipping |
| * |
| ******************************************************************************/ |
| #pragma once |
| |
| #include "common/simdintrin.h" |
| #include "core/context.h" |
| #include "core/pa.h" |
| #include "rdtsc_core.h" |
| |
| // Temp storage used by the clipper |
| extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7]; |
| #if USE_SIMD16_FRONTEND |
| extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7]; |
| #endif |
| |
// Clip codes are shifted out of the float mantissa so that a clip code, when its bits
// are used as a float in a SIMD compare, is never a denormalized value.
#define CLIPCODE_SHIFT 23

/// Per-vertex clip code bits, one bit per violated clipping plane.
enum SWR_CLIPCODES
{
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    NEGW             = (0x40 << CLIPCODE_SHIFT),

    // The four guardband codes share a single high bit and are told apart by 4 separate
    // LSBs. That is enough because guardband codes are combined with a union, not an
    // intersection, of clip codes.
    GUARDBAND_LEFT   = ((0x80 << CLIPCODE_SHIFT) | 0x1),
    GUARDBAND_TOP    = ((0x80 << CLIPCODE_SHIFT) | 0x2),
    GUARDBAND_RIGHT  = ((0x80 << CLIPCODE_SHIFT) | 0x4),
    GUARDBAND_BOTTOM = ((0x80 << CLIPCODE_SHIFT) | 0x8)
};

// Every clip code that forces a primitive through the clipper: the guardband edges,
// the near/far planes and negative w.
#define GUARDBAND_CLIP_MASK (GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | FRUSTUM_NEAR | FRUSTUM_FAR | NEGW)
| |
| template<typename SIMD_T> |
| void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes) |
| { |
| clipCodes = SIMD_T::setzero_ps(); |
| |
| // -w |
| typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f)); |
| |
| // FRUSTUM_LEFT |
| typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW); |
| clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT))); |
| |
| // FRUSTUM_TOP |
| vRes = SIMD_T::cmplt_ps(vertex.y, vNegW); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP)))); |
| |
| // FRUSTUM_RIGHT |
| vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT)))); |
| |
| // FRUSTUM_BOTTOM |
| vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM)))); |
| |
| if (state.rastState.depthClipEnable) |
| { |
| // FRUSTUM_NEAR |
| // DX clips depth [0..w], GL clips [-w..w] |
| if (state.rastState.clipHalfZ) |
| { |
| vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps()); |
| } |
| else |
| { |
| vRes = SIMD_T::cmplt_ps(vertex.z, vNegW); |
| } |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR)))); |
| |
| // FRUSTUM_FAR |
| vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR)))); |
| } |
| |
| // NEGW |
| vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps()); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW)))); |
| |
| // GUARDBAND_LEFT |
| typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes)); |
| vRes = SIMD_T::cmplt_ps(vertex.x, gbMult); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT)))); |
| |
| // GUARDBAND_TOP |
| gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes)); |
| vRes = SIMD_T::cmplt_ps(vertex.y, gbMult); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP)))); |
| |
| // GUARDBAND_RIGHT |
| gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes)); |
| vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT)))); |
| |
| // GUARDBAND_BOTTOM |
| gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes)); |
| vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult); |
| clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM)))); |
| } |
| |
/// Primary template, intentionally empty: the binner selection logic lives in the
/// per-SIMD-width specializations.
template<typename SIMD_T>
struct BinnerChooser {};
| |
| template<> |
| struct BinnerChooser<SIMD256> |
| { |
| PFN_PROCESS_PRIMS pfnBinFunc; |
| |
| BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast) |
| :pfnBinFunc(nullptr) |
| { |
| if (numVertsPerPrim == 3) |
| { |
| pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0); |
| |
| } |
| else if (numVertsPerPrim == 2) |
| { |
| pfnBinFunc = BinLines; |
| } |
| else |
| { |
| SWR_ASSERT(0 && "Unexpected points in clipper."); |
| } |
| } |
| |
| BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast) |
| :pfnBinFunc(nullptr) |
| { |
| switch (topology) |
| { |
| case TOP_POINT_LIST: |
| pfnBinFunc = BinPoints; |
| break; |
| case TOP_LINE_LIST: |
| case TOP_LINE_STRIP: |
| case TOP_LINE_LOOP: |
| case TOP_LINE_LIST_ADJ: |
| case TOP_LISTSTRIP_ADJ: |
| pfnBinFunc = BinLines; |
| break; |
| default: |
| pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0); |
| break; |
| }; |
| } |
| |
| void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx) |
| { |
| SWR_ASSERT(pfnBinFunc != nullptr); |
| |
| pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx); |
| } |
| }; |
| |
| #if USE_SIMD16_FRONTEND |
| template<> |
| struct BinnerChooser<SIMD512> |
| { |
| PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc; |
| |
| BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast) |
| :pfnBinFunc(nullptr) |
| { |
| if (numVertsPerPrim == 3) |
| { |
| pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0); |
| |
| } |
| else if (numVertsPerPrim == 2) |
| { |
| pfnBinFunc = BinLines_simd16; |
| } |
| else |
| { |
| SWR_ASSERT(0 && "Unexpected points in clipper."); |
| } |
| } |
| |
| BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast) |
| :pfnBinFunc(nullptr) |
| { |
| switch (topology) |
| { |
| case TOP_POINT_LIST: |
| pfnBinFunc = BinPoints_simd16; |
| break; |
| case TOP_LINE_LIST: |
| case TOP_LINE_STRIP: |
| case TOP_LINE_LOOP: |
| case TOP_LINE_LIST_ADJ: |
| case TOP_LISTSTRIP_ADJ: |
| pfnBinFunc = BinLines_simd16; |
| break; |
| default: |
| pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0); |
| break; |
| }; |
| } |
| |
| void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx) |
| { |
| SWR_ASSERT(pfnBinFunc != nullptr); |
| |
| pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx); |
| } |
| }; |
| |
| #endif |
/// Primary template, intentionally empty: the width-specific helper operations are
/// provided by the specializations.
template<typename SIMD_T>
struct SimdHelper {};
| |
| template<> |
| struct SimdHelper<SIMD256> |
| { |
| static SIMD256::Float insert_lo_ps(SIMD256::Float a) |
| { |
| return a; |
| } |
| |
| static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b) |
| { |
| return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b)); |
| } |
| }; |
| |
| #if USE_SIMD16_FRONTEND |
| template<> |
| struct SimdHelper<SIMD512> |
| { |
| static SIMD512::Float insert_lo_ps(SIMD256::Float a) |
| { |
| return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a); |
| } |
| |
| static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b) |
| { |
| return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b); |
| } |
| }; |
| |
| #endif |
// Accessor for the temp vertex storage used by the clipper. The primary template is
// intentionally empty; each SIMD width specializes GetTempVertices().
template<typename SIMD_T>
struct ClipHelper {};
| |
| template<> |
| struct ClipHelper<SIMD256> |
| { |
| static SIMDVERTEX_T<SIMD256> *GetTempVertices() |
| { |
| return tlsTempVertices; |
| } |
| }; |
| |
| #if USE_SIMD16_FRONTEND |
| template<> |
| struct ClipHelper<SIMD512> |
| { |
| static SIMDVERTEX_T<SIMD512> *GetTempVertices() |
| { |
| return tlsTempVertices_simd16; |
| } |
| }; |
| |
| #endif |
| template<typename SIMD_T, uint32_t NumVertsPerPrim> |
| class Clipper |
| { |
| public: |
| INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) : |
| workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC)) |
| { |
| static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); |
| } |
| |
| void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes) |
| { |
| for (uint32_t i = 0; i < NumVertsPerPrim; ++i) |
| { |
| ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes); |
| } |
| } |
| |
| typename SIMD_T::Float ComputeClipCodeIntersection() |
| { |
| typename SIMD_T::Float result = clipCodes[0]; |
| |
| for (uint32_t i = 1; i < NumVertsPerPrim; ++i) |
| { |
| result = SIMD_T::and_ps(result, clipCodes[i]); |
| } |
| |
| return result; |
| } |
| |
| typename SIMD_T::Float ComputeClipCodeUnion() |
| { |
| typename SIMD_T::Float result = clipCodes[0]; |
| |
| for (uint32_t i = 1; i < NumVertsPerPrim; ++i) |
| { |
| result = SIMD_T::or_ps(result, clipCodes[i]); |
| } |
| |
| return result; |
| } |
| |
| int ComputeClipMask() |
| { |
| typename SIMD_T::Float clipUnion = ComputeClipCodeUnion(); |
| |
| clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK))); |
| |
| return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps())); |
| } |
| |
| // clipper is responsible for culling any prims with NAN coordinates |
| int ComputeNaNMask(typename SIMD_T::Vec4 prim[]) |
| { |
| typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps(); |
| |
| for (uint32_t e = 0; e < NumVertsPerPrim; ++e) |
| { |
| typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]); |
| vNanMask = SIMD_T::or_ps(vNanMask, vNan01); |
| |
| typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]); |
| vNanMask = SIMD_T::or_ps(vNanMask, vNan23); |
| } |
| |
| return SIMD_T::movemask_ps(vNanMask); |
| } |
| |
| int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[]) |
| { |
| uint8_t cullMask = state.backendState.cullDistanceMask; |
| uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset; |
| |
| typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps(); |
| |
| typename SIMD_T::Vec4 vClipCullDistLo[3]; |
| typename SIMD_T::Vec4 vClipCullDistHi[3]; |
| |
| pa.Assemble(vertexClipCullOffset, vClipCullDistLo); |
| pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi); |
| |
| DWORD index; |
| while (_BitScanForward(&index, cullMask)) |
| { |
| cullMask &= ~(1 << index); |
| uint32_t slot = index >> 2; |
| uint32_t component = index & 0x3; |
| |
| typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); |
| for (uint32_t e = 0; e < NumVertsPerPrim; ++e) |
| { |
| typename SIMD_T::Float vCullComp; |
| if (slot == 0) |
| { |
| vCullComp = vClipCullDistLo[e][component]; |
| } |
| else |
| { |
| vCullComp = vClipCullDistHi[e][component]; |
| } |
| |
| // cull if cull distance < 0 || NAN |
| typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp); |
| vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); |
| } |
| vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem); |
| } |
| |
| // clipper should also discard any primitive with NAN clip distance |
| uint8_t clipMask = state.backendState.clipDistanceMask; |
| while (_BitScanForward(&index, clipMask)) |
| { |
| clipMask &= ~(1 << index); |
| uint32_t slot = index >> 2; |
| uint32_t component = index & 0x3; |
| |
| for (uint32_t e = 0; e < NumVertsPerPrim; ++e) |
| { |
| typename SIMD_T::Float vClipComp; |
| if (slot == 0) |
| { |
| vClipComp = vClipCullDistLo[e][component]; |
| } |
| else |
| { |
| vClipComp = vClipCullDistHi[e][component]; |
| } |
| |
| typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp); |
| vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip); |
| } |
| } |
| |
| return SIMD_T::movemask_ps(vClipCullMask); |
| } |
| |
| void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa, |
| const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx) |
| { |
| // input/output vertex store for clipper |
| SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle |
| |
| uint32_t constantInterpMask = state.backendState.constantInterpolationMask; |
| uint32_t provokingVertex = 0; |
| if (pa.binTopology == TOP_TRIANGLE_FAN) |
| { |
| provokingVertex = state.frontendState.provokingVertex.triFan; |
| } |
| ///@todo: line topology for wireframe? |
| |
| // assemble pos |
| typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim]; |
| for (uint32_t i = 0; i < NumVertsPerPrim; ++i) |
| { |
| vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i]; |
| } |
| |
| // assemble attribs |
| const SWR_BACKEND_STATE& backendState = state.backendState; |
| |
| int32_t maxSlot = -1; |
| for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot) |
| { |
| // Compute absolute attrib slot in vertex array |
| uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot; |
| maxSlot = std::max<int32_t>(maxSlot, mapSlot); |
| uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot; |
| |
| pa.Assemble(inputSlot, tmpVector); |
| |
| // if constant interpolation enabled for this attribute, assign the provoking |
| // vertex values to all edges |
| if (CheckBit(constantInterpMask, slot)) |
| { |
| for (uint32_t i = 0; i < NumVertsPerPrim; ++i) |
| { |
| vertices[i].attrib[inputSlot] = tmpVector[provokingVertex]; |
| } |
| } |
| else |
| { |
| for (uint32_t i = 0; i < NumVertsPerPrim; ++i) |
| { |
| vertices[i].attrib[inputSlot] = tmpVector[i]; |
| } |
| } |
| } |
| |
| // assemble user clip distances if enabled |
| uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset; |
| if (state.backendState.clipDistanceMask & 0xf) |
| { |
| pa.Assemble(vertexClipCullSlot, tmpVector); |
| for (uint32_t i = 0; i < NumVertsPerPrim; ++i) |
| { |
| vertices[i].attrib[vertexClipCullSlot] = tmpVector[i]; |
| } |
| } |
| |
| if (state.backendState.clipDistanceMask & 0xf0) |
| { |
| pa.Assemble(vertexClipCullSlot + 1, tmpVector); |
| for (uint32_t i = 0; i < NumVertsPerPrim; ++i) |
| { |
| vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i]; |
| } |
| } |
| |
| uint32_t numAttribs = maxSlot + 1; |
| |
| typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); |
| |
| BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast); |
| |
| // set up new PA for binning clipped primitives |
| PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; |
| if (NumVertsPerPrim == 3) |
| { |
| clipTopology = TOP_TRIANGLE_FAN; |
| |
| // so that the binner knows to bloat wide points later |
| if (pa.binTopology == TOP_POINT_LIST) |
| { |
| clipTopology = TOP_POINT_LIST; |
| } |
| } |
| else if (NumVertsPerPrim == 2) |
| { |
| clipTopology = TOP_LINE_LIST; |
| } |
| else |
| { |
| SWR_ASSERT(0 && "Unexpected points in clipper."); |
| } |
| |
| const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts); |
| const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId); |
| const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx); |
| const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx); |
| |
| const SIMD256::Integer vOffsets = SIMD256::set_epi32( |
| 0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane |
| 6 * sizeof(SIMDVERTEX_T<SIMD_T>), |
| 5 * sizeof(SIMDVERTEX_T<SIMD_T>), |
| 4 * sizeof(SIMDVERTEX_T<SIMD_T>), |
| 3 * sizeof(SIMDVERTEX_T<SIMD_T>), |
| 2 * sizeof(SIMDVERTEX_T<SIMD_T>), |
| 1 * sizeof(SIMDVERTEX_T<SIMD_T>), |
| 0 * sizeof(SIMDVERTEX_T<SIMD_T>)); |
| |
| // only need to gather 7 verts |
| // @todo dynamic mask based on actual # of verts generated per lane |
| const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1); |
| |
| uint32_t numClippedPrims = 0; |
| |
| // tranpose clipper output so that each lane's vertices are in SIMD order |
| // set aside space for 2 vertices, as the PA will try to read up to 16 verts |
| // for triangle fan |
| |
| #if defined(_DEBUG) |
| // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds |
| SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64)); |
| |
| #else |
| SIMDVERTEX_T<SIMD_T> transposedPrims[2]; |
| |
| #endif |
| uint32_t numInputPrims = pa.NumPrims(); |
| for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) |
| { |
| uint32_t numEmittedVerts = pVertexCount[inputPrim]; |
| if (numEmittedVerts < NumVertsPerPrim) |
| { |
| continue; |
| } |
| SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); |
| |
| uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); |
| SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper."); |
| |
| numClippedPrims += numEmittedPrims; |
| |
| // tranpose clipper output so that each lane's vertices are in SIMD order |
| // set aside space for 2 vertices, as the PA will try to read up to 16 verts |
| // for triangle fan |
| |
| // transpose pos |
| uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; |
| |
| #if 0 |
| // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug |
| static const float *dummy = reinterpret_cast<const float *>(pBase); |
| |
| #endif |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); |
| transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); |
| pBase += sizeof(typename SIMD_T::Float); |
| } |
| |
| // transpose attribs |
| pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim; |
| |
| for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) |
| { |
| uint32_t attribSlot = backendState.vertexAttribOffset + attrib; |
| |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); |
| transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); |
| pBase += sizeof(typename SIMD_T::Float); |
| } |
| } |
| |
| // transpose user clip distances if enabled |
| uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset; |
| if (state.backendState.clipDistanceMask & 0x0f) |
| { |
| pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim; |
| |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); |
| transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); |
| pBase += sizeof(typename SIMD_T::Float); |
| } |
| } |
| |
| if (state.backendState.clipDistanceMask & 0xf0) |
| { |
| pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim; |
| |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask); |
| transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp); |
| pBase += sizeof(typename SIMD_T::Float); |
| } |
| } |
| |
| PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology); |
| clipPA.viewportArrayActive = pa.viewportArrayActive; |
| clipPA.rtArrayActive = pa.rtArrayActive; |
| |
| static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f }; |
| |
| const uint32_t primMask = primMaskMap[numEmittedPrims]; |
| |
| const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]); |
| const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]); |
| const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]); |
| |
| |
| while (clipPA.GetNextStreamOutput()) |
| { |
| do |
| { |
| typename SIMD_T::Vec4 attrib[NumVertsPerPrim]; |
| |
| bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib); |
| |
| if (assemble) |
| { |
| binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx); |
| } |
| |
| } while (clipPA.NextPrim()); |
| } |
| } |
| |
| #if defined(_DEBUG) |
| AlignedFree(transposedPrims); |
| |
| #endif |
| // update global pipeline stat |
| UPDATE_STAT_FE(CPrimitives, numClippedPrims); |
| } |
| |
| void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask, |
| typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx) |
| { |
| SWR_ASSERT(pa.pDC != nullptr); |
| |
| SWR_CONTEXT *pContext = pa.pDC->pContext; |
| |
| BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast); |
| |
| // update clipper invocations pipeline stat |
| uint32_t numInvoc = _mm_popcnt_u32(primMask); |
| UPDATE_STAT_FE(CInvocations, numInvoc); |
| |
| ComputeClipCodes(prim, viewportIdx); |
| |
| // cull prims with NAN coords |
| primMask &= ~ComputeNaNMask(prim); |
| |
| // user cull distance cull |
| if (state.backendState.cullDistanceMask) |
| { |
| primMask &= ~ComputeUserClipCullMask(pa, prim); |
| } |
| |
| // cull prims outside view frustum |
| typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection(); |
| int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps()); |
| |
| // skip clipping for points |
| uint32_t clipMask = 0; |
| if (NumVertsPerPrim != 1) |
| { |
| clipMask = primMask & ComputeClipMask(); |
| } |
| |
| if (clipMask) |
| { |
| AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); |
| // we have to clip tris, execute the clipper, which will also |
| // call the binner |
| ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx); |
| AR_END(FEGuardbandClip, 1); |
| } |
| else if (validMask) |
| { |
| // update CPrimitives pipeline state |
| UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); |
| |
| // forward valid prims directly to binner |
| binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx); |
| } |
| } |
| |
| private: |
| typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1) |
| { |
| return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1)); |
| } |
| |
| typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component) |
| { |
| const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>); |
| const uint32_t componentStride = sizeof(typename SIMD_T::Float); |
| const uint32_t attribStride = sizeof(typename SIMD_T::Vec4); |
| |
| static const OSALIGNSIMD16(uint32_t) elemOffset[16] = |
| { |
| 0 * sizeof(float), |
| 1 * sizeof(float), |
| 2 * sizeof(float), |
| 3 * sizeof(float), |
| 4 * sizeof(float), |
| 5 * sizeof(float), |
| 6 * sizeof(float), |
| 7 * sizeof(float), |
| 8 * sizeof(float), |
| 9 * sizeof(float), |
| 10 * sizeof(float), |
| 11 * sizeof(float), |
| 12 * sizeof(float), |
| 13 * sizeof(float), |
| 14 * sizeof(float), |
| 15 * sizeof(float), |
| }; |
| |
| static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets."); |
| |
| typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset)); |
| |
| // step to the simdvertex |
| typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride)); |
| |
| // step to the attribute and component |
| vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component)); |
| |
| // step to the lane |
| vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset); |
| |
| return vOffsets; |
| } |
| |
| typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component) |
| { |
| typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component); |
| typename SIMD_T::Float vSrc = SIMD_T::setzero_ps(); |
| |
| return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask); |
| } |
| |
| void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc) |
| { |
| typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component); |
| |
| const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets); |
| const float *pSrc = reinterpret_cast<const float *>(&vSrc); |
| uint32_t mask = SIMD_T::movemask_ps(vMask); |
| DWORD lane; |
| while (_BitScanForward(&lane, mask)) |
| { |
| mask &= ~(1 << lane); |
| const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane]; |
| *(float *)pBuf = pSrc[lane]; |
| } |
| } |
| |
| template<SWR_CLIPCODES ClippingPlane> |
| void intersect( |
| const typename SIMD_T::Float &vActiveMask, // active lanes to operate on |
| const typename SIMD_T::Integer &s, // index to first edge vertex v0 in pInPts. |
| const typename SIMD_T::Integer &p, // index to second edge vertex v1 in pInPts. |
| const typename SIMD_T::Vec4 &v1, // vertex 0 position |
| const typename SIMD_T::Vec4 &v2, // vertex 1 position |
| typename SIMD_T::Integer &outIndex, // output index. |
| const float *pInVerts, // array of all the input positions. |
| uint32_t numInAttribs, // number of attributes per vertex. |
| float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. |
| { |
| uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; |
| uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset; |
| |
| // compute interpolation factor |
| typename SIMD_T::Float t; |
| switch (ClippingPlane) |
| { |
| case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break; |
| case FRUSTUM_RIGHT: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break; |
| case FRUSTUM_TOP: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break; |
| case FRUSTUM_BOTTOM: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break; |
| case FRUSTUM_NEAR: |
| // DX Znear plane is 0, GL is -w |
| if (this->state.rastState.clipHalfZ) |
| { |
| t = ComputeInterpFactor(v1[2], v2[2]); |
| } |
| else |
| { |
| t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2])); |
| } |
| break; |
| case FRUSTUM_FAR: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break; |
| default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane); |
| }; |
| |
| // interpolate position and store |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]); |
| ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); |
| } |
| |
| // interpolate attributes and store |
| for (uint32_t a = 0; a < numInAttribs; ++a) |
| { |
| uint32_t attribSlot = vertexAttribOffset + a; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); |
| typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); |
| typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); |
| ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); |
| } |
| } |
| |
| // interpolate clip distance if enabled |
| if (this->state.backendState.clipDistanceMask & 0xf) |
| { |
| uint32_t attribSlot = vertexClipCullOffset; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); |
| typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); |
| typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); |
| ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); |
| } |
| } |
| |
| if (this->state.backendState.clipDistanceMask & 0xf0) |
| { |
| uint32_t attribSlot = vertexClipCullOffset + 1; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); |
| typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); |
| typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); |
| ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); |
| } |
| } |
| } |
| |
| template<SWR_CLIPCODES ClippingPlane> |
| typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v) |
| { |
| switch (ClippingPlane) |
| { |
| case FRUSTUM_LEFT: return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); |
| case FRUSTUM_RIGHT: return SIMD_T::cmple_ps(v[0], v[3]); |
| case FRUSTUM_TOP: return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); |
| case FRUSTUM_BOTTOM: return SIMD_T::cmple_ps(v[1], v[3]); |
| case FRUSTUM_NEAR: return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f))); |
| case FRUSTUM_FAR: return SIMD_T::cmple_ps(v[2], v[3]); |
| default: |
| SWR_INVALID("invalid clipping plane: %d", ClippingPlane); |
| return SIMD_T::setzero_ps(); |
| } |
| } |
| |
| template<SWR_CLIPCODES ClippingPlane> |
| typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts) |
| { |
| uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; |
| |
| typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si(); |
| typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si(); |
| typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); |
| |
| while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty |
| { |
| typename SIMD_T::Integer s = vCurIndex; |
| typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); |
| typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p); |
| p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask))); |
| |
| // gather position |
| typename SIMD_T::Vec4 vInPos0, vInPos1; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); |
| vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); |
| } |
| |
| // compute inside mask |
| typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0); |
| typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1); |
| |
| // compute intersection mask (s_in != p_in) |
| typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in); |
| intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); |
| |
| // store s if inside |
| s_in = SIMD_T::and_ps(s_in, vActiveMask); |
| if (!SIMD_T::testz_ps(s_in, s_in)) |
| { |
| // store position |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); |
| } |
| |
| // store attribs |
| for (uint32_t a = 0; a < numInAttribs; ++a) |
| { |
| uint32_t attribSlot = vertexAttribOffset + a; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); |
| ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); |
| } |
| } |
| |
| // store clip distance if enabled |
| uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset; |
| if (this->state.backendState.clipDistanceMask & 0xf) |
| { |
| uint32_t attribSlot = vertexClipCullSlot; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); |
| ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); |
| } |
| } |
| |
| if (this->state.backendState.clipDistanceMask & 0xf0) |
| { |
| uint32_t attribSlot = vertexClipCullSlot + 1; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); |
| ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); |
| } |
| } |
| |
| // increment outIndex |
| vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); |
| } |
| |
| // compute and store intersection |
| if (!SIMD_T::testz_ps(intersectMask, intersectMask)) |
| { |
| intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); |
| |
| // increment outIndex for active lanes |
| vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask); |
| } |
| |
| // increment loop index and update active mask |
| vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1)); |
| vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); |
| } |
| |
| return vOutIndex; |
| } |
| |
| template<SWR_CLIPCODES ClippingPlane> |
| typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts) |
| { |
| uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; |
| |
| typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si(); |
| typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si(); |
| typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); |
| |
| if (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) |
| { |
| typename SIMD_T::Integer s = vCurIndex; |
| typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); |
| |
| // gather position |
| typename SIMD_T::Vec4 vInPos0, vInPos1; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); |
| vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); |
| } |
| |
| // compute inside mask |
| typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0); |
| typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1); |
| |
| // compute intersection mask (s_in != p_in) |
| typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in); |
| intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); |
| |
| // store s if inside |
| s_in = SIMD_T::and_ps(s_in, vActiveMask); |
| if (!SIMD_T::testz_ps(s_in, s_in)) |
| { |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); |
| } |
| |
| // interpolate attributes and store |
| for (uint32_t a = 0; a < numInAttribs; ++a) |
| { |
| uint32_t attribSlot = vertexAttribOffset + a; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); |
| ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); |
| } |
| } |
| |
| // increment outIndex |
| vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in); |
| } |
| |
| // compute and store intersection |
| if (!SIMD_T::testz_ps(intersectMask, intersectMask)) |
| { |
| intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); |
| |
| // increment outIndex for active lanes |
| vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask); |
| } |
| |
| // store p if inside |
| p_in = SIMD_T::and_ps(p_in, vActiveMask); |
| if (!SIMD_T::testz_ps(p_in, p_in)) |
| { |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); |
| } |
| |
| // interpolate attributes and store |
| for (uint32_t a = 0; a < numInAttribs; ++a) |
| { |
| uint32_t attribSlot = vertexAttribOffset + a; |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); |
| ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); |
| } |
| } |
| |
| // increment outIndex |
| vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in); |
| } |
| } |
| |
| return vOutIndex; |
| } |
| |
| typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs) |
| { |
| // temp storage |
| float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices()); |
| |
| // zero out num input verts for non-active lanes |
| typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim); |
| vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask); |
| |
| // clip prims to frustum |
| typename SIMD_T::Integer vNumOutPts; |
| if (NumVertsPerPrim == 3) |
| { |
| vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); |
| vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); |
| vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); |
| vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); |
| vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); |
| vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); |
| } |
| else |
| { |
| SWR_ASSERT(NumVertsPerPrim == 2); |
| vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); |
| vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); |
| vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); |
| vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); |
| vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); |
| vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); |
| } |
| |
| // restore num verts for non-clipped, active lanes |
| typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask); |
| vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask); |
| |
| return vNumOutPts; |
| } |
| |
| const uint32_t workerId{ 0 }; |
| DRAW_CONTEXT *pDC{ nullptr }; |
| const API_STATE &state; |
| typename SIMD_T::Float clipCodes[NumVertsPerPrim]; |
| }; |
| |
| |
| // pipeline stage functions |
| void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx); |
| void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx); |
| void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx); |
| #if USE_SIMD16_FRONTEND |
| void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); |
| void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); |
| void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); |
| #endif |
| |