diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index e7ef6b7f5e75f7f6cc55e85a3c5e6837d352a9be..4a45541aae6bbfe7393b093675c8f113c26a91ff 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -370,9 +370,16 @@ static void simdDeriveClassificationBlk_HBD(AlfClassifier **classifier, int **la __m128i d1 = _mm_max_epi32(sumD0, sumD1); __m128i d0 = _mm_min_epi32(sumD0, sumD1); - __m128i a = _mm_xor_si128(_mm_mullo_epi32(d1, hv0), _mm_set1_epi32(0x80000000)); - __m128i b = _mm_xor_si128(_mm_mullo_epi32(hv1, d0), _mm_set1_epi32(0x80000000)); - __m128i dirIdx = _mm_cmpgt_epi32(a, b); + __m128i a0 = _mm_mul_epu32(d1, hv0); + __m128i b0 = _mm_mul_epu32(hv1, d0); + __m128i dirIdx0 = _mm_cmpgt_epi64(a0, b0); // SSE4.2 + + __m128i a1 = _mm_mul_epu32(_mm_srli_si128(d1, 4), _mm_srli_si128(hv0, 4)); + __m128i b1 = _mm_mul_epu32(_mm_srli_si128(hv1, 4), _mm_srli_si128(d0, 4)); + __m128i dirIdx1 = _mm_cmpgt_epi64(a1, b1); // SSE4.2 + + __m128i dirIdx = _mm_blend_epi16(dirIdx0, dirIdx1, 0xcc); // SSE4.1 + __m128i hvd1 = _mm_blendv_epi8(hv1, d1, dirIdx); __m128i hvd0 = _mm_blendv_epi8(hv0, d0, dirIdx); @@ -1680,7 +1687,10 @@ template <X86_VEXT vext> void AdaptiveLoopFilter::_initAdaptiveLoopFilterX86() { #if RExt__HIGH_BIT_DEPTH_SUPPORT - m_deriveClassificationBlk = simdDeriveClassificationBlk_HBD; + if (vext >= SSE42) + { + m_deriveClassificationBlk = simdDeriveClassificationBlk_HBD; + } #ifdef USE_AVX2 if (vext >= AVX2) {