// blob: 33d37e3cecea7da635a2841d0156b464f2e59da2 [file] [log] [blame]
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_INTRIN_H__
#define __SWR_INTRIN_H__
#include "os.h"
#define SIMD_ARCH KNOB_ARCH
#include "simdlib_types.hpp"
// Short aliases for the SIMDImpl types declared in simdlib_types.hpp.
// Each group below maps one SIMDImpl implementation namespace to a common
// set of names: <prefix>scalar (Float), <prefix>scalard (Double),
// <prefix>scalari (Integer), <prefix>vector (Vec4) and <prefix>mask (Mask).
// NOTE(review): the 4/8/16 in the prefix presumably is the number of 32-bit
// lanes in the 128/256/512-bit implementations - confirm against
// simdlib_types.hpp.

// simd4*: aliases for the SIMD128Impl types.
typedef SIMDImpl::SIMD128Impl::Float simd4scalar;
typedef SIMDImpl::SIMD128Impl::Double simd4scalard;
typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector;
typedef SIMDImpl::SIMD128Impl::Mask simd4mask;
// simd8*: aliases for the SIMD256Impl types.
typedef SIMDImpl::SIMD256Impl::Float simd8scalar;
typedef SIMDImpl::SIMD256Impl::Double simd8scalard;
typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector;
typedef SIMDImpl::SIMD256Impl::Mask simd8mask;
// simd16*: aliases for the SIMD512Impl types.
typedef SIMDImpl::SIMD512Impl::Float simd16scalar;
typedef SIMDImpl::SIMD512Impl::Double simd16scalard;
typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector;
typedef SIMDImpl::SIMD512Impl::Mask simd16mask;
// Width-agnostic aliases (simdscalar, simdvector, ...) selected at compile
// time by KNOB_SIMD_WIDTH. Only a width of 8 is handled here, in which case
// the unprefixed names resolve to the simd8* typedefs; any other value stops
// the build with an explicit error instead of leaving the names undefined.
#if KNOB_SIMD_WIDTH == 8
typedef simd8scalar simdscalar;
typedef simd8scalard simdscalard;
typedef simd8scalari simdscalari;
typedef simd8vector simdvector;
typedef simd8mask simdmask;
#else
#error Unsupported vector width
#endif
// Parallel bit deposit: scatters the low-order bits of 'a' into the bit
// positions that are set in 'mask'. The lowest set bit of 'mask' receives
// bit 0 of 'a', the next set bit receives bit 1 of 'a', and so on. Result
// bits whose 'mask' bit is clear are zero.
//
// a    - source value whose low-order bits are deposited
// mask - selects the destination bit positions
// Returns the deposited value.
//
// When KNOB_ARCH is at least KNOB_ARCH_AVX2 the _pdep_u32 intrinsic is used
// directly; otherwise the software loop below produces the same result.
INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pdep_u32(a, mask);
#else
    UINT result = 0;

    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
    // using bsf instead of funky loop
    DWORD maskIndex;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask. The shift is done on an
        //    unsigned constant so that maskIndex == 31 never shifts a
        //    signed int into its sign bit.
        const UINT lowest = 1u << maskIndex;

        // 2. broadcast the LSB of the source to all bits (all ones or all
        //    zeros) using unsigned wrap-around instead of a signed
        //    arithmetic right shift.
        const UINT LSB = 0u - (a & 1u);

        // 3. deposit the source bit at the selected mask position
        result |= LSB & lowest;

        // 4. clear lowest bit so the next scan finds the next mask bit
        mask &= ~lowest;

        // 5. prepare for next iteration: advance to the next source bit
        a >>= 1;
    }

    return result;
#endif
}
// Parallel bit extract: gathers the bits of 'a' found at the positions set
// in 'mask' and packs them contiguously into the low-order bits of the
// result. The bit of 'a' at the lowest set mask position becomes result
// bit 0, the one at the next set position becomes bit 1, and so on. The
// remaining high-order result bits are zero.
//
// a    - source value to extract bits from
// mask - selects which bit positions of 'a' are extracted
// Returns the packed value.
//
// When KNOB_ARCH is at least KNOB_ARCH_AVX2 the _pext_u32 intrinsic is used
// directly; otherwise the software loop below produces the same result.
INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pext_u32(a, mask);
#else
    UINT result = 0;
    DWORD maskIndex;
    uint32_t currentBit = 0; // next destination bit position in result

    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask. Unsigned constant so that
        //    maskIndex == 31 never shifts a signed int into its sign bit.
        const UINT lowest = 1u << maskIndex;

        // 2. copy the selected bit from the source into the next free
        //    result position. The 0/1 value is made unsigned before the
        //    shift: with all 32 mask bits set, currentBit reaches 31.
        const UINT sourceBit = ((a & lowest) != 0) ? 1u : 0u;
        result |= sourceBit << currentBit;
        ++currentBit;

        // 3. clear lowest bit so the next scan finds the next mask bit
        mask &= ~lowest;
    }

    return result;
#endif
}
#endif//__SWR_INTRIN_H__