From 38738d6a65f7e172a8a57b7a1b41dc6003afb1c4 Mon Sep 17 00:00:00 2001 From: Vadim Seregin <vseregin@qti.qualcomm.com> Date: Wed, 20 Oct 2021 20:10:31 -0700 Subject: [PATCH] split AVX2 SSE SIMD --- source/Lib/CommonLib/x86/RdCostX86.h | 47 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h index 529287f2e..0bbbe46e5 100644 --- a/source/Lib/CommonLib/x86/RdCostX86.h +++ b/source/Lib/CommonLib/x86/RdCostX86.h @@ -389,26 +389,41 @@ Distortion RdCost::xGetSSE_NxN_SIMD(const DistParam &rcDtParam) if (vext >= AVX2 && ((iCols & 15) == 0)) { #ifdef USE_AVX2 - __m256i Sum = _mm256_setzero_si256(); __m256i vzero = _mm256_setzero_si256(); - for (int iY = 0; iY < iRows; iY++) + + int num = 1; /* number of row chunks; each chunk gets its own 32-bit lane accumulator */ + int rows = iRows; + + if( iRows > 64 ) + { + num = iRows / 64; /* NOTE(review): trailing rows are dropped unless iRows is a multiple of 64 - confirm callers guarantee this */ + rows = 64; + } + + for( int i = 0; i < num; i++ ) { - for (int iX = 0; iX < iCols; iX += 16) + __m256i Sum = _mm256_setzero_si256(); + + for( int iY = 0; iY < rows; iY++ ) { - __m256i Src1 = (sizeof(Torg) > 1) ? (_mm256_lddqu_si256((__m256i *)(&pSrc1[iX]))) : (_mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_lddqu_si128((__m128i *) (&pSrc1[iX]))), 0xD8), vzero)); - __m256i Src2 = (sizeof(Tcur) > 1) ? (_mm256_lddqu_si256((__m256i *)(&pSrc2[iX]))) : (_mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_lddqu_si128((__m128i *) (&pSrc2[iX]))), 0xD8), vzero)); - __m256i Diff = _mm256_sub_epi16(Src1, Src2); - __m256i Res = _mm256_madd_epi16(Diff, Diff); - Sum = _mm256_add_epi32(Sum, Res); + for( int iX = 0; iX < iCols; iX += 16 ) + { + __m256i Src1 = (sizeof( Torg ) > 1) ? (_mm256_lddqu_si256( (__m256i *)(&pSrc1[iX]) )) : (_mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_lddqu_si128( (__m128i *) (&pSrc1[iX]) ) ), 0xD8 ), vzero )); + __m256i Src2 = (sizeof( Tcur ) > 1) ? 
(_mm256_lddqu_si256( (__m256i *)(&pSrc2[iX]) )) : (_mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_lddqu_si128( (__m128i *) (&pSrc2[iX]) ) ), 0xD8 ), vzero )); + __m256i Diff = _mm256_sub_epi16( Src1, Src2 ); + __m256i Res = _mm256_madd_epi16( Diff, Diff ); + Sum = _mm256_add_epi32( Sum, Res ); + } + pSrc1 += iStrideSrc1; + pSrc2 += iStrideSrc2; } - pSrc1 += iStrideSrc1; - pSrc2 += iStrideSrc2; - } - Sum = _mm256_add_epi64(_mm256_unpacklo_epi32(Sum, vzero), _mm256_unpackhi_epi32(Sum, vzero)); - Sum = _mm256_add_epi64(Sum, _mm256_permute4x64_epi64(Sum, 14)); - Sum = _mm256_add_epi64(Sum, _mm256_permute4x64_epi64(Sum, 1)); - uiRet = _mm_cvtsi128_si64(_mm256_castsi256_si128(Sum)) >> uiShift; + Sum = _mm256_add_epi64( _mm256_unpacklo_epi32( Sum, vzero ), _mm256_unpackhi_epi32( Sum, vzero ) ); + Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 14 ) ); + Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 1 ) ); + uiRet += _mm_cvtsi128_si64( _mm256_castsi256_si128( Sum ) ); /* add the unshifted 64-bit chunk sum; uiShift is applied exactly once after the loop (assumes uiRet starts at 0 - TODO confirm) */ + } + uiRet >>= uiShift; #endif } else if (iCols > 32 && (iCols & 15) == 0) @@ -2578,7 +2593,7 @@ void RdCost::_initRdCostX86() m_afpDistortFunc[DF_SSE16] = xGetSSE_NxN_SIMD<16, vext>; m_afpDistortFunc[DF_SSE32] = xGetSSE_NxN_SIMD<32, vext>; m_afpDistortFunc[DF_SSE64] = xGetSSE_NxN_SIMD<64, vext>; -#if DIST_SSE_ENABLE && CTU_256 && 0 +#if CTU_256 m_afpDistortFunc[DF_SSE16N] = xGetSSE_NxN_SIMD<vext>; #else m_afpDistortFunc[DF_SSE16N] = xGetSSE_NxN_SIMD<128, vext>; -- GitLab