diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index 6baefb7b1bf6be3d0a5629af43cda9a8403789cd..7e3c4104f6b70c5d75cf265a70e2fd77fddc00c9 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -367,7 +367,7 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, #if !JVET_P0057_BDOF_PROF_HARMONIZATION __m256i mm_dIoffset = _mm256_set1_epi32(1); #endif - __m256i mm_offset = _mm256_set1_epi32(offset); + __m256i mm_offset = _mm256_set1_epi16(offset); __m256i vibdimin = _mm256_set1_epi16(clpRng.min); __m256i vibdimax = _mm256_set1_epi16(clpRng.max); __m256i mm_dimin = _mm256_set1_epi32(-dILimit); @@ -377,7 +377,7 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, #if !JVET_P0057_BDOF_PROF_HARMONIZATION __m128i mm_dIoffset = _mm_set1_epi32(1); #endif - __m128i mm_offset = _mm_set1_epi32(offset); + __m128i mm_offset = _mm_set1_epi16(offset); __m128i vibdimin = _mm_set1_epi16(clpRng.min); __m128i vibdimax = _mm_set1_epi16(clpRng.max); __m128i mm_dimin = _mm_set1_epi32(-dILimit); @@ -458,9 +458,7 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, mm_dI = _mm256_add_epi16(mm_dI, mm_src); if (!bi) { - __m256i tmp0 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_srai_epi32(_mm256_unpacklo_epi16(mm_dI, mm_dI),16), mm_offset), shiftNum); - __m256i tmp1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_srai_epi32(_mm256_unpackhi_epi16(mm_dI, mm_dI),16), mm_offset), shiftNum); - mm_dI = _mm256_packs_epi32(tmp0, tmp1); + mm_dI = _mm256_srai_epi16(_mm256_adds_epi16(mm_dI, mm_offset), shiftNum); mm_dI = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, mm_dI)); } @@ -499,9 +497,7 @@ void applyPROF_SSE(Pel* dstPel, int dstStride, const Pel* srcPel, int srcStride, mm_dI = _mm_add_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src), _mm_loadl_epi64((const __m128i *)(src + srcStride))), mm_dI); if (!bi) { - __m128i tmp0 = _mm_srai_epi32(_mm_add_epi32(_mm_srai_epi32(_mm_unpacklo_epi16(mm_dI, mm_dI),16), mm_offset), shiftNum); - __m128i tmp1 = _mm_srai_epi32(_mm_add_epi32(_mm_srai_epi32(_mm_unpackhi_epi16(mm_dI, mm_dI),16), mm_offset), shiftNum); - mm_dI = _mm_packs_epi32(tmp0, tmp1); + mm_dI = _mm_srai_epi16(_mm_adds_epi16(mm_dI, mm_offset), shiftNum); mm_dI = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_dI)); }