| /**************************************************************************** |
| * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * @file backend.cpp |
| * |
| * @brief Backend handles rasterization, pixel shading and output merger |
| * operations. |
| * |
| ******************************************************************************/ |
| |
| #include <smmintrin.h> |
| |
| #include "backend.h" |
| #include "backend_impl.h" |
| #include "tilemgr.h" |
| #include "memory/tilingtraits.h" |
| #include "core/multisample.h" |
| |
| #include <algorithm> |
| |
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample backend. Walks a KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM
///        region starting at (x, y) in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps.
///        For each step with non-zero coverage it optionally applies the depth
///        bounds test, generates input coverage, computes barycentrics,
///        centroid and interpolated Z, applies user clip distances, runs the
///        depth/stencil test before or after the pixel shader depending on
///        T::bCanEarlyZ, invokes the pixel shader and calls the output merger.
/// @tparam T  traits type; this function reads T::InputCoverage and
///            T::bCanEarlyZ and forwards T to SetupPixelShaderContext,
///            generateInputCoverage and CalcCentroid.
/// @param pDC  draw context; source of the API state, bucket manager,
///             per-worker private data and private state used below.
/// @param workerId  index into pDC->pContext->threadPool.pThreadData; also
///                  forwarded to OutputMerger8x2.
/// @param x  starting X coordinate of the region.
/// @param y  starting Y coordinate of the region.
/// @param work  triangle descriptor. Its coverageMask[0] (and
///              innerCoverageMask for inner-conservative input coverage) is
///              shifted in place as SIMD tiles are consumed, so the
///              descriptor is modified by this call.
/// @param renderBuffers  forwarded to SetupRenderBuffers, and to
///                       SetRenderHotTilesDirty when any lane was still
///                       active after a pixel shader call.
/// NOTE(review): RDTSC_BEGIN/RDTSC_END appear to be paired profiling bucket
/// markers - confirm against their definition.
| template <typename T> |
| void BackendSingleSample(DRAW_CONTEXT* pDC, |
| uint32_t workerId, |
| uint32_t x, |
| uint32_t y, |
| SWR_TRIANGLE_DESC& work, |
| RenderOutputBuffers& renderBuffers) |
| { |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId); |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId); |
| |
| void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; |
| |
| const API_STATE& state = GetApiState(pDC); |
| |
| BarycentricCoeffs coeffs; |
| SetupBarycentricCoeffs(&coeffs, work); |
| |
| SWR_PS_CONTEXT psContext; |
| const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; |
| SetupPixelShaderContext<T>(&psContext, samplePos, work); |
| |
| uint8_t *pDepthBuffer, *pStencilBuffer; |
| SetupRenderBuffers(psContext.pColorBuffer, |
| &pDepthBuffer, |
| &pStencilBuffer, |
| state.colorHottileEnable, |
| renderBuffers); |
| |
| // Indicates backend rendered something to the color buffer |
| bool isTileDirty = false; |
| |
| RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1); |
| |
// Seed the per-lane Y positions (upper-left and center) for the first row;
// they advance by dy at the bottom of the outer loop.
| psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); |
| psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); |
| |
| const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); |
| |
| for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) |
| { |
// X positions restart at x for every row and advance by dx per SIMD tile.
| psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); |
| psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); |
| |
| const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); |
| |
| for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) |
| { |
// Set when the SIMD_TILE_X_DIM bit of xx is set. Forwarded to
// OutputMerger8x2 and gates the color buffer pointer advance at Endtile.
| const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); |
| |
| |
// Coverage for the current SIMD tile is taken from the low bits of
// coverageMask[0] (selected by MASK); the mask is shifted right at Endtile
// so the next tile's bits move into place.
| simdmask coverageMask = work.coverageMask[0] & MASK; |
| |
| if (coverageMask) |
| { |
// Depth bounds test: loads the current depth buffer values and keeps only
// the lanes reported by CalcDepthBoundsAcceptMask.
| if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) |
| { |
| static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, |
| "Unsupported depth hot tile format"); |
| |
| const simdscalar z = |
| _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer)); |
| |
| const float minz = state.depthBoundsState.depthBoundsTestMinValue; |
| const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; |
| |
| coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); |
| } |
| |
| if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) |
| { |
// Inner-conservative input coverage reads work.innerCoverageMask; every
// other enabled mode reads work.coverageMask[0].
| const uint64_t* pCoverageMask = |
| (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) |
| ? &work.innerCoverageMask |
| : &work.coverageMask[0]; |
| |
| generateInputCoverage<T, T::InputCoverage>( |
| pCoverageMask, psContext.inputMask, state.blendState.sampleMask); |
| } |
| |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId); |
| |
| CalcPixelBarycentrics(coeffs, psContext); |
| |
| CalcCentroid<T, true>( |
| &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); |
| |
| // interpolate and quantize z |
| psContext.vZ = vplaneps( |
| coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); |
| psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); |
| |
| RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1); |
| |
| // interpolate user clip distance if available |
| if (state.backendState.clipDistanceMask) |
| { |
// The ComputeUserClipMask result is inverted before the AND, so lanes set
// in it are removed from coverage.
| coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, |
| work.pUserClipBuffer, |
| psContext.vI.center, |
| psContext.vJ.center); |
| } |
| |
// Expand the bit mask to a per-lane vector mask; depth and stencil pass
// masks start out equal to coverage.
| simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); |
| simdscalar depthPassMask = vCoverageMask; |
| simdscalar stencilPassMask = vCoverageMask; |
| |
| // Early-Z? |
| if (T::bCanEarlyZ) |
| { |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId); |
| depthPassMask = DepthStencilTest(&state, |
| work.triFlags.frontFacing, |
| work.triFlags.viewportIndex, |
| psContext.vZ, |
| pDepthBuffer, |
| vCoverageMask, |
| pStencilBuffer, |
| &stencilPassMask); |
| AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), |
| _simd_movemask_ps(stencilPassMask), |
| _simd_movemask_ps(vCoverageMask))); |
| RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0); |
| |
| // write depth/stencil now if earlyZ is forced on or no pixels passed depth; only the latter exits early |
| if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) |
| { |
| DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], |
| &state.depthStencilState, |
| work.triFlags.frontFacing, |
| psContext.vZ, |
| pDepthBuffer, |
| depthPassMask, |
| vCoverageMask, |
| pStencilBuffer, |
| stencilPassMask); |
| |
| if (!_simd_movemask_ps(depthPassMask)) |
| { |
| goto Endtile; |
| } |
| } |
| } |
| |
// The shader's active mask is seeded from vCoverageMask, not depthPassMask.
| psContext.sampleIndex = 0; |
| psContext.activeMask = _simd_castps_si(vCoverageMask); |
| |
| // execute pixel shader |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId); |
| state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext); |
| RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0); |
| |
| // update stats |
// PsInvocations counts the lanes of the pre-shader coverage mask;
// vCoverageMask is only refreshed from activeMask after this point.
| UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); |
| AR_EVENT(PSStats((HANDLE)&psContext.stats)); |
| |
// Re-read activeMask after the shader call; lanes cleared there are treated
// as shader discards by the code below.
| vCoverageMask = _simd_castsi_ps(psContext.activeMask); |
| |
// The dirty flag is derived from the post-shader mask, before any late
// depth test has run.
| if (_simd_movemask_ps(vCoverageMask)) |
| { |
| isTileDirty = true; |
| } |
| |
| // late-Z |
| if (!T::bCanEarlyZ) |
| { |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId); |
| depthPassMask = DepthStencilTest(&state, |
| work.triFlags.frontFacing, |
| work.triFlags.viewportIndex, |
| psContext.vZ, |
| pDepthBuffer, |
| vCoverageMask, |
| pStencilBuffer, |
| &stencilPassMask); |
| AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), |
| _simd_movemask_ps(stencilPassMask), |
| _simd_movemask_ps(vCoverageMask))); |
| RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0); |
| |
| if (!_simd_movemask_ps(depthPassMask)) |
| { |
| // need to call depth/stencil write for stencil write |
| DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], |
| &state.depthStencilState, |
| work.triFlags.frontFacing, |
| psContext.vZ, |
| pDepthBuffer, |
| depthPassMask, |
| vCoverageMask, |
| pStencilBuffer, |
| stencilPassMask); |
| goto Endtile; |
| } |
| } |
| else |
| { |
| // for early z, consolidate discards from shader |
| // into depthPassMask |
| depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask); |
| } |
| |
| uint32_t statMask = _simd_movemask_ps(depthPassMask); |
| uint32_t statCount = _mm_popcnt_u32(statMask); |
| UPDATE_STAT_BE(DepthPassCount, statCount); |
| |
| // output merger |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId); |
| |
| OutputMerger8x2(pDC, |
| psContext, |
| psContext.pColorBuffer, |
| 0, |
| &state.blendState, |
| state.pfnBlendFunc, |
| vCoverageMask, |
| depthPassMask, |
| state.psState.renderTargetMask, |
| useAlternateOffset, |
| workerId); |
| |
| // do final depth write after all pixel kills |
// When T::bCanEarlyZ and forceEarlyZ are both set, the write was already
// issued in the early-Z block above.
// NOTE(review): with forceEarlyZ set but !T::bCanEarlyZ no depth write is
// issued on this path - presumably that combination never occurs; confirm.
| if (!state.psState.forceEarlyZ) |
| { |
| DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], |
| &state.depthStencilState, |
| work.triFlags.frontFacing, |
| psContext.vZ, |
| pDepthBuffer, |
| depthPassMask, |
| vCoverageMask, |
| pStencilBuffer, |
| stencilPassMask); |
| } |
| RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0); |
| } |
| |
// Endtile runs for every SIMD tile, including tiles skipped for zero
// coverage or left via goto, so the coverage masks and buffer pointers
// stay in step with the loop.
| Endtile: |
| RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId); |
| |
| work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); |
| if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) |
| { |
| work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); |
| } |
| |
// Color buffer pointers (one per bit set in colorHottileEnable) advance only
// when useAlternateOffset is set, by 2 * KNOB_SIMD_WIDTH pixels' worth of
// storage (bpp / 8 - presumably bytes per pixel); depth and stencil
// pointers advance by KNOB_SIMD_WIDTH pixels' worth on every tile.
| if (useAlternateOffset) |
| { |
| unsigned long rt; |
| uint32_t rtMask = state.colorHottileEnable; |
| while (_BitScanForward(&rt, rtMask)) |
| { |
| rtMask &= ~(1 << rt); |
| psContext.pColorBuffer[rt] += |
| (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; |
| } |
| } |
| |
| pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; |
| pStencilBuffer += |
| (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; |
| |
| RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0); |
| |
| psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); |
| psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); |
| } |
| |
| psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); |
| psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); |
| } |
| |
| if (isTileDirty) |
| { |
| SetRenderHotTilesDirty(pDC, renderBuffers); |
| } |
| |
| RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0); |
| } |
| |
| // Recursive template used to auto-nest conditionals. Converts dynamic enum function |
| // arguments to static template arguments. |
| template <uint32_t... ArgsT> |
| struct BEChooserSingleSample |
| { |
| // Last Arg Terminator |
| static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) |
| { |
| switch (tArg) |
| { |
| case SWR_BACKEND_SINGLE_SAMPLE: |
| return BackendSingleSample<SwrBackendTraits<ArgsT...>>; |
| break; |
| case SWR_BACKEND_MSAA_PIXEL_RATE: |
| case SWR_BACKEND_MSAA_SAMPLE_RATE: |
| default: |
| SWR_ASSERT(0 && "Invalid backend func\n"); |
| return nullptr; |
| break; |
| } |
| } |
| |
| // Recursively parse args |
| template <typename... TArgsT> |
| static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) |
| { |
| switch (tArg) |
| { |
| case SWR_INPUT_COVERAGE_NONE: |
| return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( |
| remainingArgs...); |
| break; |
| case SWR_INPUT_COVERAGE_NORMAL: |
| return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc( |
| remainingArgs...); |
| break; |
| case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: |
| return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc( |
| remainingArgs...); |
| break; |
| default: |
| SWR_ASSERT(0 && "Invalid sample pattern\n"); |
| return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc( |
| remainingArgs...); |
| break; |
| } |
| } |
| |
| // Recursively parse args |
| template <typename... TArgsT> |
| static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) |
| { |
| switch (tArg) |
| { |
| case SWR_MULTISAMPLE_1X: |
| return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); |
| break; |
| case SWR_MULTISAMPLE_2X: |
| return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); |
| break; |
| case SWR_MULTISAMPLE_4X: |
| return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); |
| break; |
| case SWR_MULTISAMPLE_8X: |
| return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); |
| break; |
| case SWR_MULTISAMPLE_16X: |
| return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); |
| break; |
| default: |
| SWR_ASSERT(0 && "Invalid sample count\n"); |
| return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); |
| break; |
| } |
| } |
| |
| // Recursively parse args |
| template <typename... TArgsT> |
| static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs) |
| { |
| if (tArg == true) |
| { |
| return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...); |
| } |
| |
| return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...); |
| } |
| }; |
| |
| void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]) |
| { |
| for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) |
| { |
| for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) |
| { |
| for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) |
| { |
| table[inputCoverage][isCentroid][canEarlyZ] = |
| BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, |
| false, |
| (SWR_INPUT_COVERAGE)inputCoverage, |
| (isCentroid > 0), |
| false, |
| (canEarlyZ > 0), |
| SWR_BACKEND_SINGLE_SAMPLE); |
| } |
| } |
| } |
| } |