diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index 5bf5a76354b543b1dd1121fba1b3d7f916a479ee..0aecb82edeb355a0f122496ed225e2a915b952f5 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -147,21 +147,32 @@ static void simdDeriveClassificationBlk(AlfClassifier **classifier, int **laplac x0 = _mm_add_epi16(x0, x1); x2 = _mm_add_epi16(x2, x3); - x0 = _mm_add_epi16(x0, x2); - x4 = _mm_add_epi16(x4, x5); x6 = _mm_add_epi16(x6, x7); - x4 = _mm_add_epi16(x4, x6); - - x1 = _mm_unpacklo_epi16(x0, x4); - x5 = _mm_unpackhi_epi16(x0, x4); - x0 = _mm_unpacklo_epi16(x1, x5); - x4 = _mm_unpackhi_epi16(x1, x5); - __m128i sumV = _mm_cvtepu16_epi32(x0); - __m128i sumH = _mm_unpackhi_epi16(x0, _mm_setzero_si128()); - __m128i sumD0 = _mm_cvtepu16_epi32(x4); - __m128i sumD1 = _mm_unpackhi_epi16(x4, _mm_setzero_si128()); + __m128i x0l = _mm_cvtepu16_epi32(x0); + __m128i x0h = _mm_unpackhi_epi16(x0, _mm_setzero_si128()); + __m128i x2l = _mm_cvtepu16_epi32(x2); + __m128i x2h = _mm_unpackhi_epi16(x2, _mm_setzero_si128()); + __m128i x4l = _mm_cvtepu16_epi32(x4); + __m128i x4h = _mm_unpackhi_epi16(x4, _mm_setzero_si128()); + __m128i x6l = _mm_cvtepu16_epi32(x6); + __m128i x6h = _mm_unpackhi_epi16(x6, _mm_setzero_si128()); + + x0l = _mm_add_epi32(x0l, x2l); + x4l = _mm_add_epi32(x4l, x6l); + x0h = _mm_add_epi32(x0h, x2h); + x4h = _mm_add_epi32(x4h, x6h); + + x2l = _mm_unpacklo_epi32(x0l, x4l); + x2h = _mm_unpackhi_epi32(x0l, x4l); + x6l = _mm_unpacklo_epi32(x0h, x4h); + x6h = _mm_unpackhi_epi32(x0h, x4h); + + __m128i sumV = _mm_unpacklo_epi32(x2l, x6l); + __m128i sumH = _mm_unpackhi_epi32(x2l, x6l); + __m128i sumD0 = _mm_unpacklo_epi32(x2h, x6h); + __m128i sumD1 = _mm_unpackhi_epi32(x2h, x6h); // uint32_t tempAct = sumV + sumH; __m128i tempAct = _mm_add_epi32(sumV, sumH);