From 5d7955be7d48d015a871f48ce42eb8a886d7cec3 Mon Sep 17 00:00:00 2001
From: Jeeva Raj A <jeeva.raj@ittiam.com>
Date: Fri, 1 Feb 2019 18:59:38 +0530
Subject: [PATCH] use _mm256_sra_epi16 instead of _mm256_srai_epi16

---
 .../CommonLib/x86/InterpolationFilterX86.h    | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h
index e31a8b61b..4bea013a3 100644
--- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h
+++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h
@@ -989,7 +989,7 @@ static void simdInterpolateN2_M4( const int16_t* src, int srcStride, int16_t *ds
 }
 #if JVET_M0147_DMVR
 #ifdef USE_AVX2
-static inline __m256i simdInterpolateLuma10Bit2P16(int16_t const *src1, int srcStride, __m256i *mmCoeff, const __m256i & mmOffset, int shift)
+static inline __m256i simdInterpolateLuma10Bit2P16(int16_t const *src1, int srcStride, __m256i *mmCoeff, const __m256i & mmOffset, __m128i &mmShift)
 {
   __m256i sumLo;
   {
@@ -999,12 +999,12 @@ static inline __m256i simdInterpolateLuma10Bit2P16(int16_t const *src1, int srcS
     __m256i lo1 = _mm256_mullo_epi16(mmPix1, mmCoeff[1]);
     sumLo = _mm256_add_epi16(lo0, lo1);
   }
-  sumLo = _mm256_srai_epi16(_mm256_add_epi16(sumLo, mmOffset), shift);
+  sumLo = _mm256_sra_epi16(_mm256_add_epi16(sumLo, mmOffset), mmShift);
   return(sumLo);
 }
 #endif
 
-static inline __m128i simdInterpolateLuma10Bit2P8(int16_t const *src1, int srcStride, __m128i *mmCoeff, const __m128i & mmOffset, int shift)
+static inline __m128i simdInterpolateLuma10Bit2P8(int16_t const *src1, int srcStride, __m128i *mmCoeff, const __m128i & mmOffset, __m128i &mmShift)
 {
   __m128i sumLo;
   {
@@ -1014,11 +1014,11 @@ static inline __m128i simdInterpolateLuma10Bit2P8(int16_t const *src1, int srcSt
     __m128i lo1 = _mm_mullo_epi16(mmPix1, mmCoeff[1]);
     sumLo = _mm_add_epi16(lo0, lo1);
   }
-  sumLo = _mm_srai_epi16(_mm_add_epi16(sumLo, mmOffset), shift);
+  sumLo = _mm_sra_epi16(_mm_add_epi16(sumLo, mmOffset), mmShift);
   return(sumLo);
 }
 
-static inline __m128i simdInterpolateLuma10Bit2P4(int16_t const *src, int srcStride, __m128i *mmCoeff, const __m128i & mmOffset, int shift)
+static inline __m128i simdInterpolateLuma10Bit2P4(int16_t const *src, int srcStride, __m128i *mmCoeff, const __m128i & mmOffset, __m128i &mmShift)
 {
   __m128i sumLo;
   {
@@ -1028,7 +1028,7 @@ static inline __m128i simdInterpolateLuma10Bit2P4(int16_t const *src, int srcStr
     __m128i lo1 = _mm_mullo_epi16(mmPix1, mmCoeff[1]);
     sumLo = _mm_add_epi16(lo0, lo1);
   }
-  sumLo = _mm_srai_epi16(_mm_add_epi16(sumLo, mmOffset), shift);
+  sumLo = _mm_sra_epi16(_mm_add_epi16(sumLo, mmOffset), mmShift);
   return sumLo;
 }
 
@@ -1037,6 +1037,7 @@ static void simdInterpolateN2_10BIT_M4(const int16_t* src, int srcStride, int16_
 {
   int row, col;
   __m128i mmOffset = _mm_set1_epi16(offset);
+  __m128i mmShift = _mm_set_epi64x(0, shift);
   __m128i mmCoeff[2];
   for (int n = 0; n < 2; n++)
     mmCoeff[n] = _mm_set1_epi16(c[n]);
@@ -1056,19 +1057,19 @@ static void simdInterpolateN2_10BIT_M4(const int16_t* src, int srcStride, int16_
     // multiple of 16
     for (; col < ((width >> 4) << 4); col += 16)
     {
-      __m256i mmFiltered = simdInterpolateLuma10Bit2P16(src + col, cStride, mm256Coeff, mm256Offset, shift);
+      __m256i mmFiltered = simdInterpolateLuma10Bit2P16(src + col, cStride, mm256Coeff, mm256Offset, mmShift);
       _mm256_storeu_si256((__m256i *)(dst + col), mmFiltered);
     }
 #endif
     // multiple of 8
     for (; col < ((width >> 3) << 3); col += 8)
     {
-      __m128i mmFiltered = simdInterpolateLuma10Bit2P8(src + col, cStride, mmCoeff, mmOffset, shift);
+      __m128i mmFiltered = simdInterpolateLuma10Bit2P8(src + col, cStride, mmCoeff, mmOffset, mmShift);
       _mm_storeu_si128((__m128i *)(dst + col), mmFiltered);
     }
 
     // last 4 samples
-    __m128i mmFiltered = simdInterpolateLuma10Bit2P4(src + col, cStride, mmCoeff, mmOffset, shift);
+    __m128i mmFiltered = simdInterpolateLuma10Bit2P4(src + col, cStride, mmCoeff, mmOffset, mmShift);
     _mm_storel_epi64((__m128i *)(dst + col), mmFiltered);
     src += srcStride;
     dst += dstStride;
-- 
GitLab