Commit 935c50a3 authored by Frank Bossen

Clean up CommonDefX86.h

Remove unused functions and macros
parent 70a9723a
1 merge request: !2667 Clean up CommonDefX86.h
Pipeline #10926 passed
@@ -31,18 +31,9 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
-/** \file CommonDefX86.h
- */
-#ifndef __COMMONDEFX86__
-#define __COMMONDEFX86__
+#pragma once
 #include "CommonLib/CommonDef.h"
-//! \ingroup CommonLib
-//! \{
 #ifdef TARGET_SIMD_X86
 #include <immintrin.h>
@@ -59,315 +50,4 @@
 #define SIMDX86 SSE41
 #endif
+#endif // TARGET_SIMD_X86
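// Transpose a 4x4 block of 32-bit lanes: T[0..3] hold the rows on input and the columns on output.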
#define TRANSPOSE4x4(T) \
{\
__m128i a01b01 = _mm_unpacklo_epi32(T[0], T[1]);\
__m128i a23b23 = _mm_unpackhi_epi32(T[0], T[1]);\
__m128i c01d01 = _mm_unpacklo_epi32(T[2], T[3]);\
__m128i c23d23 = _mm_unpackhi_epi32(T[2], T[3]);\
\
T[0] = _mm_unpacklo_epi64(a01b01, c01d01);\
T[1] = _mm_unpackhi_epi64(a01b01, c01d01);\
T[2] = _mm_unpacklo_epi64(a23b23, c23d23);\
T[3] = _mm_unpackhi_epi64(a23b23, c23d23);\
}\
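// Transpose an 8x8 block of 16-bit lanes held in T[0..7].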
#define TRANSPOSE8x8(T) \
{\
__m128i a03b03 = _mm_unpacklo_epi16(T[0], T[1]);\
__m128i c03d03 = _mm_unpacklo_epi16(T[2], T[3]);\
__m128i e03f03 = _mm_unpacklo_epi16(T[4], T[5]);\
__m128i g03h03 = _mm_unpacklo_epi16(T[6], T[7]);\
__m128i a47b47 = _mm_unpackhi_epi16(T[0], T[1]);\
__m128i c47d47 = _mm_unpackhi_epi16(T[2], T[3]);\
__m128i e47f47 = _mm_unpackhi_epi16(T[4], T[5]);\
__m128i g47h47 = _mm_unpackhi_epi16(T[6], T[7]);\
\
__m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);\
__m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);\
__m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);\
__m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);\
__m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);\
__m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);\
__m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);\
__m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);\
\
T[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);\
T[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);\
T[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);\
T[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);\
T[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);\
T[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);\
T[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);\
T[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);\
}\
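// Note: TRANSPOSESAT4x4 and ADDCLIP4 below reference a zero vector named vzero
// (e.g. _mm_setzero_si128()) that must be defined at the point of expansion.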
#define TRANSPOSESAT4x4(T)\
{\
TRANSPOSE4x4(T);\
T[0] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[0], vzero));\
T[1] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[1], vzero));\
T[2] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[2], vzero));\
T[3] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[3], vzero));\
}\
#define ADDCLIP4(dstptr, res, min, max)\
{\
__m128i vdst = _mm_lddqu_si128((__m128i*) dstptr);\
vdst = _mm_add_epi16(vdst, _mm_packs_epi32(res, vzero));\
vdst = _mm_min_epi16(max, _mm_max_epi16(min, vdst));\
_mm_storel_epi64((__m128i*) dstptr, vdst);\
}\
#define TRANSPOSESTORE8x8_ALGN(T, D, stride)\
{\
TRANSPOSE8x8(T); \
\
_mm_store_si128((__m128i*)&D[0*stride], T[0]);\
_mm_store_si128((__m128i*)&D[1*stride], T[1]);\
_mm_store_si128((__m128i*)&D[2*stride], T[2]);\
_mm_store_si128((__m128i*)&D[3*stride], T[3]);\
_mm_store_si128((__m128i*)&D[4*stride], T[4]);\
_mm_store_si128((__m128i*)&D[5*stride], T[5]);\
_mm_store_si128((__m128i*)&D[6*stride], T[6]);\
_mm_store_si128((__m128i*)&D[7*stride], T[7]);\
}\
#define ADDCLIP(dstptr, res, min, max)\
{\
__m128i vdst = _mm_load_si128((__m128i*) dstptr);\
vdst = _mm_add_epi16(vdst, res ); \
vdst = _mm_min_epi16(max,_mm_max_epi16(min, vdst));\
_mm_store_si128((__m128i*)dstptr, vdst);\
}\
#define TRANSPOSEADDCLIPSTORE8x8_ALGN(T, D, stride, min, max)\
{\
TRANSPOSE8x8(T); \
\
ADDCLIP(&D[0*stride], T[0], min, max);\
ADDCLIP(&D[1*stride], T[1], min, max);\
ADDCLIP(&D[2*stride], T[2], min, max);\
ADDCLIP(&D[3*stride], T[3], min, max);\
ADDCLIP(&D[4*stride], T[4], min, max);\
ADDCLIP(&D[5*stride], T[5], min, max);\
ADDCLIP(&D[6*stride], T[6], min, max);\
ADDCLIP(&D[7*stride], T[7], min, max);\
}\
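
// A minimal usage sketch (not part of the original file): add an 8x8 block of
// 16-bit values held in T[0..7] to a destination whose rows are 16-byte
// aligned (stride in int16 elements, a multiple of 8) and clamp the result to
// the range of the given bit depth. Function and parameter names are
// illustrative assumptions.
static inline void addResidual8x8_sketch(int16_t* dst, ptrdiff_t stride, __m128i T[8], int bitDepth)
{
  const __m128i vmin = _mm_setzero_si128();
  const __m128i vmax = _mm_set1_epi16((int16_t)((1 << bitDepth) - 1));
  // Expands to TRANSPOSE8x8 followed by a load/add/clamp/store per row.
  TRANSPOSEADDCLIPSTORE8x8_ALGN(T, dst, stride, vmin, vmax);
}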
static inline __m128i _mm_sel_si128(__m128i a, __m128i b, __m128i mask)
{
#ifdef USE_SSE41
return _mm_blendv_epi8( a, b, mask);
#else
return _mm_or_si128( _mm_andnot_si128( mask, a ), _mm_and_si128( b, mask ));
#endif
}
static inline __m128i _mm_clip_epi8(__m128i v, __m128i low, __m128i hi)
{
#ifdef USE_SSE41
return _mm_min_epi8(_mm_max_epi8(v, low), hi);
#else
__m128i vlowm = _mm_cmplt_epi8(v, low);
__m128i vhighm = _mm_cmpgt_epi8(v, hi);
return _mm_sel_si128(_mm_sel_si128(v, low, vlowm), hi, vhighm);
#endif
}
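
// A minimal usage sketch (not part of the original file): clamp 16 signed
// bytes to [-tc, tc], the kind of range a deblocking-style filter would use.
// The function name and pointer are illustrative assumptions.
static inline void clipBytes_sketch(int8_t* p, int8_t tc)
{
  const __m128i lo = _mm_set1_epi8((char)-tc);
  const __m128i hi = _mm_set1_epi8((char) tc);
  __m128i v = _mm_loadu_si128((const __m128i*)p);
  v = _mm_clip_epi8(v, lo, hi);   // min/max pair on SSE4.1, compare+select otherwise
  _mm_storeu_si128((__m128i*)p, v);
}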
#ifdef USE_AVX2
#define TRANSPOSESTORE16x16_ALGN(T, D, stride)\
{\
TRANSPOSE16x16_AVX2(T); \
\
for (int _i_=0; _i_ < 16; _i_++)\
_mm256_store_si256((__m256i*)&D[_i_*stride], T[_i_]);\
}\
#define ADDCLIPAVX2(dstptr, res, min, max)\
{\
__m256i vdst = _mm256_load_si256((__m256i*) dstptr);\
vdst = _mm256_adds_epi16(vdst, res ); \
vdst = _mm256_min_epi16(max,_mm256_max_epi16(min, vdst));\
_mm256_store_si256((__m256i*)dstptr, vdst);\
}\
#define TRANSPOSEADDCLIPSTORE16x16_ALGN(T, D, stride, min, max)\
{\
TRANSPOSE16x16_AVX2(T); \
\
for (int _i_=0; _i_ < 16; _i_++)\
ADDCLIPAVX2(&D[_i_*stride], T[_i_], min, max);\
}
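// Transpose a 16x16 block of 16-bit lanes held in T[0..15]. The epi16/epi32/epi64
// unpacks operate within each 128-bit lane, so a final _mm256_permute2x128_si256
// recombines the low and high lanes into rows i and i+8.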
static inline void TRANSPOSE16x16_AVX2(__m256i T[16])
{
__m256i T_03[8];
__m256i T_47[8];
for (int i = 0; i < 8; i++)
{
T_03[i] = _mm256_unpacklo_epi16(T[2*i], T[2*i+1]);
T_47[i] = _mm256_unpackhi_epi16(T[2*i], T[2*i+1]);
}
__m256i T_01[4];
__m256i T_23[4];
__m256i T_45[4];
__m256i T_67[4];
for (int i = 0; i < 4; i++)
{
T_01[i] = _mm256_unpacklo_epi32(T_03[2*i], T_03[2*i+1]);
T_23[i] = _mm256_unpackhi_epi32(T_03[2*i], T_03[2*i+1]);
T_45[i] = _mm256_unpacklo_epi32(T_47[2*i], T_47[2*i+1]);
T_67[i] = _mm256_unpackhi_epi32(T_47[2*i], T_47[2*i+1]);
}
__m256i TR[8][2];
for (int i = 0; i < 2; i++)
{
TR[0][i] = _mm256_unpacklo_epi64(T_01[2*i], T_01[2*i+1]);
TR[1][i] = _mm256_unpackhi_epi64(T_01[2*i], T_01[2*i+1]);
TR[2][i] = _mm256_unpacklo_epi64(T_23[2*i], T_23[2*i+1]);
TR[3][i] = _mm256_unpackhi_epi64(T_23[2*i], T_23[2*i+1]);
TR[4][i] = _mm256_unpacklo_epi64(T_45[2*i], T_45[2*i+1]);
TR[5][i] = _mm256_unpackhi_epi64(T_45[2*i], T_45[2*i+1]);
TR[6][i] = _mm256_unpacklo_epi64(T_67[2*i], T_67[2*i+1]);
TR[7][i] = _mm256_unpackhi_epi64(T_67[2*i], T_67[2*i+1]);
}
for (int i = 0; i < 8; i++)
{
T[i] = _mm256_permute2x128_si256(TR[i][0], TR[i][1], 0x20);
T[i+8] = _mm256_permute2x128_si256(TR[i][0], TR[i][1], 0x31);
}
}
static inline void TRANSPOSE16x8_AVX2(__m256i T[8])
{
__m256i T_03[4];
__m256i T_47[4];
for (int i = 0; i < 4; i++)
{
T_03[i] = _mm256_unpacklo_epi16(T[2*i], T[2*i+1]);
T_47[i] = _mm256_unpackhi_epi16(T[2*i], T[2*i+1]);
}
__m256i T_01[2];
__m256i T_23[2];
__m256i T_45[2];
__m256i T_67[2];
for (int i = 0; i < 2; i++)
{
T_01[i] = _mm256_unpacklo_epi32(T_03[2*i], T_03[2*i+1]);
T_23[i] = _mm256_unpackhi_epi32(T_03[2*i], T_03[2*i+1]);
T_45[i] = _mm256_unpacklo_epi32(T_47[2*i], T_47[2*i+1]);
T_67[i] = _mm256_unpackhi_epi32(T_47[2*i], T_47[2*i+1]);
}
T[0] = _mm256_unpacklo_epi64(T_01[0], T_01[1]);
T[1] = _mm256_unpackhi_epi64(T_01[0], T_01[1]);
T[2] = _mm256_unpacklo_epi64(T_23[0], T_23[1]);
T[3] = _mm256_unpackhi_epi64(T_23[0], T_23[1]);
T[4] = _mm256_unpacklo_epi64(T_45[0], T_45[1]);
T[5] = _mm256_unpackhi_epi64(T_45[0], T_45[1]);
T[6] = _mm256_unpacklo_epi64(T_67[0], T_67[1]);
T[7] = _mm256_unpackhi_epi64(T_67[0], T_67[1]);
}
static inline void TRANSPOSE8x8_32b_AVX2(__m256i T[8])
{
__m256i T_03[4];
__m256i T_47[4];
for (int i = 0; i < 4; i++)
{
T_03[i] = _mm256_unpacklo_epi32(T[2*i], T[2*i+1]);
T_47[i] = _mm256_unpackhi_epi32(T[2*i], T[2*i+1]);
}
__m256i T_01[2];
__m256i T_23[2];
__m256i T_45[2];
__m256i T_67[2];
for (int i = 0; i < 2; i++)
{
T_01[i] = _mm256_unpacklo_epi64(T_03[2*i], T_03[2*i+1]);
T_23[i] = _mm256_unpackhi_epi64(T_03[2*i], T_03[2*i+1]);
T_45[i] = _mm256_unpacklo_epi64(T_47[2*i], T_47[2*i+1]);
T_67[i] = _mm256_unpackhi_epi64(T_47[2*i], T_47[2*i+1]);
}
T[0] = _mm256_permute2x128_si256(T_01[0], T_01[1], 0x20);
T[4] = _mm256_permute2x128_si256(T_01[0], T_01[1], 0x31);
T[1] = _mm256_permute2x128_si256(T_23[0], T_23[1], 0x20);
T[5] = _mm256_permute2x128_si256(T_23[0], T_23[1], 0x31);
T[2] = _mm256_permute2x128_si256(T_45[0], T_45[1], 0x20);
T[6] = _mm256_permute2x128_si256(T_45[0], T_45[1], 0x31);
T[3] = _mm256_permute2x128_si256(T_67[0], T_67[1], 0x20);
T[7] = _mm256_permute2x128_si256(T_67[0], T_67[1], 0x31);
}
#endif
#ifdef USE_AVX512
ALWAYS_INLINE inline __m512i
_mm512_set_epi16( int16_t x31, int16_t x30, int16_t x29, int16_t x28,
int16_t x27, int16_t x26, int16_t x25, int16_t x24,
int16_t x23, int16_t x22, int16_t x21, int16_t x20,
int16_t x19, int16_t x18, int16_t x17, int16_t x16,
int16_t x15, int16_t x14, int16_t x13, int16_t x12,
int16_t x11, int16_t x10, int16_t x9, int16_t x8,
int16_t x7, int16_t x6, int16_t x5, int16_t x4,
int16_t x3, int16_t x2, int16_t x1, int16_t x0 )
{
return _mm512_set_epi32( (x31<<16) + (0xffff & x30), (x29<<16) + (0xffff & x28),
(x27<<16) + (0xffff & x26), (x25<<16) + (0xffff & x24),
(x23<<16) + (0xffff & x22), (x21<<16) + (0xffff & x20),
(x19<<16) + (0xffff & x18), (x17<<16) + (0xffff & x16),
(x15<<16) + (0xffff & x14), (x13<<16) + (0xffff & x12),
(x11<<16) + (0xffff & x10), ( x9<<16) + (0xffff & x8),
( x7<<16) + (0xffff & x6), ( x5<<16) + (0xffff & x4),
( x3<<16) + (0xffff & x2), ( x1<<16) + (0xffff & x0) );
}
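
// A minimal usage sketch (not part of the original file): interleave an
// illustrative coefficient pair (c0, c1) across the 32 int16 lanes of a
// 512-bit register via the helper above.
static inline __m512i makeCoeffPairs_sketch(int16_t c0, int16_t c1)
{
  return _mm512_set_epi16(c1, c0, c1, c0, c1, c0, c1, c0,
                          c1, c0, c1, c0, c1, c0, c1, c0,
                          c1, c0, c1, c0, c1, c0, c1, c0,
                          c1, c0, c1, c0, c1, c0, c1, c0);
}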
#endif
#ifdef ENABLE_REGISTER_PRINTING
/* note for gcc: this helper causes a compilation error due to name mangling
 * when instantiated with different types for R in the same translation unit;
 * the workaround is to compile with -fabi-version=4 or higher
 * (requires gcc >= 4.5) */
template< class T, class R >
static void _printReg( const R var, const char* varname, uint8_t count = sizeof( R ) )
{
T *val = (T*)&var;
const int varcnt = std::min(count, (uint8_t)(sizeof(R)/sizeof(T)));
std::cout << varname << ":";
for( int i = 0; i < varcnt; ++i )
{
std::cout << " " << std::setw(sizeof(T)*2 + 1)<< val[i];
}
std::cout << std::endl;
}
#define PREG( var, t, cnt ) \
{ \
static unsigned c = cnt; \
if( c ) \
{ \
std::cout << cnt - c << " "; \
_printReg<t>( var, #var ); \
c--; \
} \
}
#else
#define PREG( var, t, cnt )
#endif
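
// A minimal usage sketch (not part of the original file): dump a register's
// contents while debugging a kernel. With ENABLE_REGISTER_PRINTING defined,
// the PREG statement prints all eight int16 lanes of v, but only on the first
// 4 times it executes; otherwise it compiles to nothing.
static inline void debugPeek_sketch(const int16_t* src)
{
  __m128i v = _mm_loadu_si128((const __m128i*)src);
  (void) v;   // avoid an unused-variable warning when printing is disabled
  PREG(v, int16_t, 4);
}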
#endif // TARGET_SIMD_X86
#endif // __COMMONDEFX86__