diff --git a/source/Lib/CommonLib/x86/CommonDefX86.h b/source/Lib/CommonLib/x86/CommonDefX86.h index eda3ec91507bf1a137f51b29f95cd806566805a3..c3138999cc70190ad16f330fca6b3a8011094a42 100644 --- a/source/Lib/CommonLib/x86/CommonDefX86.h +++ b/source/Lib/CommonLib/x86/CommonDefX86.h @@ -31,18 +31,9 @@ * THE POSSIBILITY OF SUCH DAMAGE. */ -/** \file CommonDefX86.h -*/ - -#ifndef __COMMONDEFX86__ -#define __COMMONDEFX86__ - - +#pragma once #include "CommonLib/CommonDef.h" -//! \ingroup CommonLib -//! \{ - #ifdef TARGET_SIMD_X86 #include <immintrin.h> @@ -59,315 +50,4 @@ #define SIMDX86 SSE41 #endif - -#define TRANSPOSE4x4(T) \ -{\ - __m128i a01b01 = _mm_unpacklo_epi32(T[0], T[1]);\ - __m128i a23b23 = _mm_unpackhi_epi32(T[0], T[1]);\ - __m128i c01d01 = _mm_unpacklo_epi32(T[2], T[3]);\ - __m128i c23d23 = _mm_unpackhi_epi32(T[2], T[3]);\ -\ - T[0] = _mm_unpacklo_epi64(a01b01, c01d01);\ - T[1] = _mm_unpackhi_epi64(a01b01, c01d01);\ - T[2] = _mm_unpacklo_epi64(a23b23, c23d23);\ - T[3] = _mm_unpackhi_epi64(a23b23, c23d23);\ -}\ - -#define TRANSPOSE8x8(T) \ -{\ - __m128i a03b03 = _mm_unpacklo_epi16(T[0], T[1]);\ - __m128i c03d03 = _mm_unpacklo_epi16(T[2], T[3]);\ - __m128i e03f03 = _mm_unpacklo_epi16(T[4], T[5]);\ - __m128i g03h03 = _mm_unpacklo_epi16(T[6], T[7]);\ - __m128i a47b47 = _mm_unpackhi_epi16(T[0], T[1]);\ - __m128i c47d47 = _mm_unpackhi_epi16(T[2], T[3]);\ - __m128i e47f47 = _mm_unpackhi_epi16(T[4], T[5]);\ - __m128i g47h47 = _mm_unpackhi_epi16(T[6], T[7]);\ -\ - __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);\ - __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);\ - __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);\ - __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);\ - __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);\ - __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);\ - __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);\ - __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);\ -\ - T[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);\ - T[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);\ - T[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);\ - T[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);\ - T[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);\ - T[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);\ - T[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);\ - T[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);\ -}\ - -#define TRANSPOSESAT4x4(T)\ -{\ - TRANSPOSE4x4(T);\ - T[0] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[0], vzero));\ - T[1] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[1], vzero));\ - T[2] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[2], vzero));\ - T[3] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[3], vzero));\ -}\ - -#define ADDCLIP4(dstptr, res, min, max)\ -{\ - __m128i vdst = _mm_lddqu_si128((__m128i*) dstptr);\ - vdst = _mm_add_epi16(vdst, _mm_packs_epi32(res, vzero));\ - vdst = _mm_min_epi16(max, _mm_max_epi16(min, vdst));\ - _mm_storel_epi64((__m128i*) dstptr, vdst);\ -}\ - -#define TRANSPOSESTORE8x8_ALGN(T, D, stride)\ -{\ - TRANSPOSE8x8(T); \ -\ - _mm_store_si128((__m128i*)&D[0*stride], T[0]);\ - _mm_store_si128((__m128i*)&D[1*stride], T[1]);\ - _mm_store_si128((__m128i*)&D[2*stride], T[2]);\ - _mm_store_si128((__m128i*)&D[3*stride], T[3]);\ - _mm_store_si128((__m128i*)&D[4*stride], T[4]);\ - _mm_store_si128((__m128i*)&D[5*stride], T[5]);\ - _mm_store_si128((__m128i*)&D[6*stride], T[6]);\ - _mm_store_si128((__m128i*)&D[7*stride], T[7]);\ -}\ - -#define ADDCLIP(dstptr, res, min, max)\ -{\ - __m128i vdst = _mm_load_si128((__m128i*) dstptr);\ - vdst = _mm_add_epi16(vdst, res ); \ - vdst = _mm_min_epi16(max,_mm_max_epi16(min, vdst));\ - _mm_store_si128((__m128i*)dstptr, vdst);\ -}\ - -#define TRANSPOSEADDCLIPSTORE8x8_ALGN(T, D, stride, min, max)\ -{\ - TRANSPOSE8x8(T); \ -\ - ADDCLIP(&D[0*stride], T[0], min, max);\ - ADDCLIP(&D[1*stride], T[1], min, max);\ - ADDCLIP(&D[2*stride], T[2], min, max);\ - ADDCLIP(&D[3*stride], T[3], min, max);\ - ADDCLIP(&D[4*stride], T[4], min, max);\ - ADDCLIP(&D[5*stride], T[5], min, max);\ - ADDCLIP(&D[6*stride], T[6], min, max);\ - ADDCLIP(&D[7*stride], T[7], min, max);\ -}\ - - -static inline __m128i _mm_sel_si128(__m128i a, __m128i b, __m128i mask) -{ -#ifdef USE_SSE41 - return _mm_blendv_epi8( a, b, mask); -#else - return _mm_or_si128( _mm_andnot_si128( mask, a ), _mm_and_si128( b, mask )); -#endif -} - - -static inline __m128i _mm_clip_epi8(__m128i v, __m128i low, __m128i hi) -{ -#ifdef USE_SSE41 - return _mm_min_epi8(_mm_max_epi8(v, low), hi); -#else - __m128i vlowm = _mm_cmplt_epi8(v, low); - __m128i vhighm = _mm_cmpgt_epi8(v, hi); - return _mm_sel_si128(_mm_sel_si128(v, low, vlowm), hi, vhighm); -#endif -} - - -#ifdef USE_AVX2 - - -#define TRANSPOSESTORE16x16_ALGN(T, D, stride)\ -{\ - TRANSPOSE16x16_AVX2(T); \ -\ - for (int _i_=0; _i_ < 16; _i_++)\ - _mm256_store_si256((__m256i*)&D[_i_*stride], T[_i_]);\ -}\ - -#define ADDCLIPAVX2(dstptr, res, min, max)\ -{\ - __m256i vdst = _mm256_load_si256((__m256i*) dstptr);\ - vdst = _mm256_adds_epi16(vdst, res ); \ - vdst = _mm256_min_epi16(max,_mm256_max_epi16(min, vdst));\ - _mm256_store_si256((__m256i*)dstptr, vdst);\ -}\ - -#define TRANSPOSEADDCLIPSTORE16x16_ALGN(T, D, stride, min, max)\ -{\ - TRANSPOSE16x16_AVX2(T); \ -\ - for (int _i_=0; _i_ < 16; _i_++)\ - ADDCLIPAVX2(&D[_i_*stride], T[_i_], min, max);\ -} - -static inline void TRANSPOSE16x16_AVX2(__m256i T[16]) -{ - __m256i T_03[8]; - __m256i T_47[8]; - for (int i = 0; i < 8; i++) - { - T_03[i] = _mm256_unpacklo_epi16(T[2*i], T[2*i+1]); - T_47[i] = _mm256_unpackhi_epi16(T[2*i], T[2*i+1]); - } - - __m256i T_01[4]; - __m256i T_23[4]; - __m256i T_45[4]; - __m256i T_67[4]; - for (int i = 0; i < 4; i++) - { - T_01[i] = _mm256_unpacklo_epi32(T_03[2*i], T_03[2*i+1]); - T_23[i] = _mm256_unpackhi_epi32(T_03[2*i], T_03[2*i+1]); - T_45[i] = _mm256_unpacklo_epi32(T_47[2*i], T_47[2*i+1]); - T_67[i] = _mm256_unpackhi_epi32(T_47[2*i], T_47[2*i+1]); - } - __m256i TR[8][2]; - - for (int i = 0; i < 2; i++) - { - TR[0][i] = _mm256_unpacklo_epi64(T_01[2*i], T_01[2*i+1]); - TR[1][i] = _mm256_unpackhi_epi64(T_01[2*i], T_01[2*i+1]); - TR[2][i] = _mm256_unpacklo_epi64(T_23[2*i], T_23[2*i+1]); - TR[3][i] = _mm256_unpackhi_epi64(T_23[2*i], T_23[2*i+1]); - TR[4][i] = _mm256_unpacklo_epi64(T_45[2*i], T_45[2*i+1]); - TR[5][i] = _mm256_unpackhi_epi64(T_45[2*i], T_45[2*i+1]); - TR[6][i] = _mm256_unpacklo_epi64(T_67[2*i], T_67[2*i+1]); - TR[7][i] = _mm256_unpackhi_epi64(T_67[2*i], T_67[2*i+1]); - } - for (int i = 0; i < 8; i++) - { - T[i] = _mm256_permute2x128_si256(TR[i][0], TR[i][1], 0x20); - T[i+8] = _mm256_permute2x128_si256(TR[i][0], TR[i][1], 0x31); - } -} - -static inline void TRANSPOSE16x8_AVX2(__m256i T[8]) -{ - __m256i T_03[4]; - __m256i T_47[4]; - for (int i = 0; i < 4; i++) - { - T_03[i] = _mm256_unpacklo_epi16(T[2*i], T[2*i+1]); - T_47[i] = _mm256_unpackhi_epi16(T[2*i], T[2*i+1]); - } - - __m256i T_01[2]; - __m256i T_23[2]; - __m256i T_45[2]; - __m256i T_67[2]; - for (int i = 0; i < 2; i++) - { - T_01[i] = _mm256_unpacklo_epi32(T_03[2*i], T_03[2*i+1]); - T_23[i] = _mm256_unpackhi_epi32(T_03[2*i], T_03[2*i+1]); - T_45[i] = _mm256_unpacklo_epi32(T_47[2*i], T_47[2*i+1]); - T_67[i] = _mm256_unpackhi_epi32(T_47[2*i], T_47[2*i+1]); - } - T[0] = _mm256_unpacklo_epi64(T_01[0], T_01[1]); - T[1] = _mm256_unpackhi_epi64(T_01[0], T_01[1]); - T[2] = _mm256_unpacklo_epi64(T_23[0], T_23[1]); - T[3] = _mm256_unpackhi_epi64(T_23[0], T_23[1]); - T[4] = _mm256_unpacklo_epi64(T_45[0], T_45[1]); - T[5] = _mm256_unpackhi_epi64(T_45[0], T_45[1]); - T[6] = _mm256_unpacklo_epi64(T_67[0], T_67[1]); - T[7] = _mm256_unpackhi_epi64(T_67[0], T_67[1]); -} - -static inline void TRANSPOSE8x8_32b_AVX2(__m256i T[8]) -{ - __m256i T_03[4]; - __m256i T_47[4]; - for (int i=0; i<4; i++){ - T_03[i] = _mm256_unpacklo_epi32(T[2*i], T[2*i+1]); - T_47[i] = _mm256_unpackhi_epi32(T[2*i], T[2*i+1]); - } - __m256i T_01[2]; - __m256i T_23[2]; - __m256i T_45[2]; - __m256i T_67[2]; - for (int i = 0; i < 2; i++) - { - T_01[i] = _mm256_unpacklo_epi64(T_03[2*i], T_03[2*i+1]); - T_23[i] = _mm256_unpackhi_epi64(T_03[2*i], T_03[2*i+1]); - T_45[i] = _mm256_unpacklo_epi64(T_47[2*i], T_47[2*i+1]); - T_67[i] = _mm256_unpackhi_epi64(T_47[2*i], T_47[2*i+1]); - } - - T[0] = _mm256_permute2x128_si256(T_01[0], T_01[1], 0x20); - T[4] = _mm256_permute2x128_si256(T_01[0], T_01[1], 0x31); - T[1] = _mm256_permute2x128_si256(T_23[0], T_23[1], 0x20); - T[5] = _mm256_permute2x128_si256(T_23[0], T_23[1], 0x31); - T[2] = _mm256_permute2x128_si256(T_45[0], T_45[1], 0x20); - T[6] = _mm256_permute2x128_si256(T_45[0], T_45[1], 0x31); - T[3] = _mm256_permute2x128_si256(T_67[0], T_67[1], 0x20); - T[7] = _mm256_permute2x128_si256(T_67[0], T_67[1], 0x31); -} - - -#endif - -#ifdef USE_AVX512 - -ALWAYS_INLINE inline __m512i -_mm512_set_epi16( int16_t x31, int16_t x30, int16_t x29, int16_t x28, - int16_t x27, int16_t x26, int16_t x25, int16_t x24, - int16_t x23, int16_t x22, int16_t x21, int16_t x20, - int16_t x19, int16_t x18, int16_t x17, int16_t x16, - int16_t x15, int16_t x14, int16_t x13, int16_t x12, - int16_t x11, int16_t x10, int16_t x9, int16_t x8, - int16_t x7, int16_t x6, int16_t x5, int16_t x4, - int16_t x3, int16_t x2, int16_t x1, int16_t x0 ) -{ - return _mm512_set_epi32( (x31<<16) + (0xffff & x30), (x29<<16) + (0xffff & x28), - (x27<<16) + (0xffff & x26), (x25<<16) + (0xffff & x24), - (x23<<16) + (0xffff & x22), (x21<<16) + (0xffff & x20), - (x19<<16) + (0xffff & x18), (x17<<16) + (0xffff & x16), - (x15<<16) + (0xffff & x14), (x13<<16) + (0xffff & x12), - (x11<<16) + (0xffff & x10), ( x9<<16) + (0xffff & x8), - ( x7<<16) + (0xffff & x6), ( x5<<16) + (0xffff & x4), - ( x3<<16) + (0xffff & x2), ( x1<<16) + (0xffff & x0) ); -} -#endif - -#ifdef ENABLE_REGISTER_PRINTING -/* note for gcc: this helper throws a compilation error - * because of name mangling when used with different types for R at the same time, - * the workaround is to compile with -fabi-version=4 or higher - * (needs gcc >= 4.5) */ -template< class T, class R > -static void _printReg( const R var, const char* varname, uint8_t count = sizeof( R ) ) -{ - T *val = (T*)&var; - const int varcnt = std::min(count, (uint8_t)(sizeof(R)/sizeof(T))); - std::cout << varname << ":"; - for( int i = 0; i < varcnt; ++i ) - { - std::cout << " " << std::setw(sizeof(T)*2 + 1)<< val[i]; - } - std::cout << std::endl; -} - -#define PREG( var, t, cnt ) \ -{ \ - static unsigned c = cnt; \ - if( c ) \ - { \ - std::cout << cnt - c << " "; \ - _printReg<t>( var, #var ); \ - c--; \ - } \ -} -#else -#define PREG( var, t, cnt ) -#endif - -#endif // TARGET_SIMD_X86 - -#endif // __COMMONDEFX86__ - - +#endif // TARGET_SIMD_X86