diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index a023f9a6eb7a36c7fab2b7e5952431e614f41591..5edd4970ac58ff0f85e2b31a669308142af6e97d 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -1878,20 +1878,7 @@ void InterPrediction::motionCompensation( PredictionUnit &pu, const RefPicList &
 
 int InterPrediction::rightShiftMSB(int numer, int denom)
 {
-  int d;
-  int msbIdx = 0;
-  for (msbIdx = 0; msbIdx<32; msbIdx++)
-  {
-    if (denom < ((int)1 << msbIdx))
-    {
-      break;
-    }
-  }
-
-  int shiftIdx = msbIdx - 1;
-  d = (numer >> shiftIdx);
-
-  return d;
+  return numer >> floorLog2(denom);
 }
 
 void InterPrediction::motionCompensation4Triangle( CodingUnit &cu, MergeCtx &triangleMrgCtx, const bool splitDir, const uint8_t candIdx0, const uint8_t candIdx1 )
diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h
index 9198a8e7d671736a072b4dac89f7bc69599343ff..4a64ef834fc8f346dc999688f17f184af3b16ebb 100644
--- a/source/Lib/CommonLib/x86/BufferX86.h
+++ b/source/Lib/CommonLib/x86/BufferX86.h
@@ -53,44 +53,33 @@ void addAvg_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int s
 {
   if( W == 8 )
   {
-    // TODO: AVX2 impl
-    {
-      __m128i vzero = _mm_setzero_si128();
-      __m128i voffset = _mm_set1_epi32( offset );
-      __m128i vibdimin = _mm_set1_epi16( clpRng.min );
-      __m128i vibdimax = _mm_set1_epi16( clpRng.max );
-
-      for( int row = 0; row < height; row++ )
-      {
-        for( int col = 0; col < width; col += 8 )
-        {
-          __m128i vsrc0 = _mm_loadu_si128( ( const __m128i * )&src0[col] );
-          __m128i vsrc1 = _mm_loadu_si128( ( const __m128i * )&src1[col] );
+    CHECK(offset & 1, "offset must be even");
+    CHECK(offset < -32768 || offset > 32767, "offset must be a 16-bit value");
 
-          __m128i vtmp, vsum, vdst;
-          vsum = _mm_cvtepi16_epi32 ( vsrc0 );
-          vdst = _mm_cvtepi16_epi32 ( vsrc1 );
-          vsum = _mm_add_epi32 ( vsum, vdst );
-          vsum = _mm_add_epi32 ( vsum, voffset );
-          vtmp = _mm_srai_epi32 ( vsum, shift );
-
-          vsrc0 = _mm_unpackhi_epi64 ( vsrc0, vzero );
-          vsrc1 = _mm_unpackhi_epi64 ( vsrc1, vzero );
-          vsum = _mm_cvtepi16_epi32 ( vsrc0 );
-          vdst = _mm_cvtepi16_epi32 ( vsrc1 );
-          vsum = _mm_add_epi32 ( vsum, vdst );
-          vsum = _mm_add_epi32 ( vsum, voffset );
-          vsum = _mm_srai_epi32 ( vsum, shift );
-          vsum = _mm_packs_epi32 ( vtmp, vsum );
-
-          vsum = _mm_min_epi16( vibdimax, _mm_max_epi16( vibdimin, vsum ) );
-          _mm_storeu_si128( ( __m128i * )&dst[col], vsum );
-        }
+    __m128i vibdimin = _mm_set1_epi16(clpRng.min);
+    __m128i vibdimax = _mm_set1_epi16(clpRng.max);
 
-        src0 += src0Stride;
-        src1 += src1Stride;
-        dst += dstStride;
+    for (int row = 0; row < height; row++)
+    {
+      for (int col = 0; col < width; col += 8)
+      {
+        __m128i vsrc0 = _mm_loadu_si128((const __m128i *) &src0[col]);
+        __m128i vsrc1 = _mm_loadu_si128((const __m128i *) &src1[col]);
+
+        vsrc0 = _mm_xor_si128(vsrc0, _mm_set1_epi16(0x7fff));
+        vsrc1 = _mm_xor_si128(vsrc1, _mm_set1_epi16(0x7fff));
+        vsrc0 = _mm_avg_epu16(vsrc0, vsrc1);
+        vsrc0 = _mm_xor_si128(vsrc0, _mm_set1_epi16(0x7fff));
+        vsrc0 = _mm_adds_epi16(vsrc0, _mm_set1_epi16(offset >> 1));
+        vsrc0 = _mm_sra_epi16(vsrc0, _mm_cvtsi32_si128(shift - 1));
+        vsrc0 = _mm_max_epi16(vsrc0, vibdimin);
+        vsrc0 = _mm_min_epi16(vsrc0, vibdimax);
+        _mm_storeu_si128((__m128i *) &dst[col], vsrc0);
      }
+
+      src0 += src0Stride;
+      src1 += src1Stride;
+      dst += dstStride;
    }
   }
   else if( W == 4 )
@@ -131,126 +120,118 @@ void addAvg_SSE( const int16_t* src0, int src0Stride, const int16_t* src1, int s
 template<X86_VEXT vext>
 void copyBufferSimd(Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height)
 {
-  __m128i x;
-#ifdef USE_AVX2
-  __m256i x16;
-#endif
-  int j, temp;
-  for (int i = 0; i < height; i++)
+  if (width < 8)
   {
-    j = 0;
-    temp = width;
-#ifdef USE_AVX2
-    while ((temp >> 4) > 0)
-    {
-      x16 = _mm256_loadu_si256((const __m256i*)(&src[i * srcStride + j]));
-      _mm256_storeu_si256((__m256i*)(&dst[i * dstStride + j]), x16);
-      j += 16;
-      temp -= 16;
-    }
-#endif
-    while ((temp >> 3) > 0)
-    {
-      x = _mm_loadu_si128((const __m128i*)(&src[ i * srcStride + j]));
-      _mm_storeu_si128((__m128i*)(&dst[ i * dstStride + j]), x);
-      j += 8;
-      temp -= 8;
-    }
-    while ((temp >> 2) > 0)
+    CHECK(width < 4, "width must be at least 4");
+
+    for (size_t x = 0; x < width; x += 4)
     {
-      x = _mm_loadl_epi64((const __m128i*)(&src[i * srcStride + j]));
-      _mm_storel_epi64((__m128i*)(&dst[i*dstStride + j]), x);
-      j += 4;
-      temp -= 4;
+      if (x > width - 4)
+        x = width - 4;
+      for (size_t y = 0; y < height; y++)
+      {
+        __m128i val = _mm_loadl_epi64((const __m128i *) (src + y * srcStride + x));
+        _mm_storel_epi64((__m128i *) (dst + y * dstStride + x), val);
+      }
     }
-    while (temp > 0)
+  }
+  else
+  {
+    for (size_t x = 0; x < width; x += 8)
     {
-      dst[i * dstStride + j] = src[i * srcStride + j];
-      j++;
-      temp--;
+      if (x > width - 8)
+        x = width - 8;
+      for (size_t y = 0; y < height; y++)
+      {
+        __m128i val = _mm_loadu_si128((const __m128i *) (src + y * srcStride + x));
+        _mm_storeu_si128((__m128i *) (dst + y * dstStride + x), val);
+      }
     }
   }
 }
-
 template<X86_VEXT vext>
 void paddingSimd(Pel *dst, int stride, int width, int height, int padSize)
 {
-  __m128i x;
-#ifdef USE_AVX2
-  __m256i x16;
-#endif
-  int temp, j;
-  for (int i = 1; i <= padSize; i++)
+  size_t extWidth = width + 2 * padSize;
+  CHECK(extWidth < 8, "width plus 2 times padding size must be at least 8");
+
+  if (padSize == 1)
   {
-    j = 0;
-    temp = width;
-#ifdef USE_AVX2
-    while ((temp >> 4) > 0)
+    for (size_t i = 0; i < height; i++)
     {
+      Pel left = dst[i * stride];
+      Pel right = dst[i * stride + width - 1];
+      dst[i * stride - 1] = left;
+      dst[i * stride + width] = right;
+    }
 
-      x16 = _mm256_loadu_si256((const __m256i*)(&(dst[j])));
-      _mm256_storeu_si256((__m256i*)(dst + j - i*stride), x16);
-      x16 = _mm256_loadu_si256((const __m256i*)(dst + j + (height - 1)*stride));
-      _mm256_storeu_si256((__m256i*)(dst + j + (height - 1 + i)*stride), x16);
-
+    dst -= 1;
 
-      j = j + 16;
-      temp = temp - 16;
-    }
-#endif
-    while ((temp >> 3) > 0)
+    for (size_t i = 0; i < extWidth - 8; i++)
     {
+      __m128i top = _mm_loadu_si128((const __m128i *) (dst + i));
+      _mm_storeu_si128((__m128i *) (dst - stride + i), top);
+    }
+    __m128i top = _mm_loadu_si128((const __m128i *) (dst + extWidth - 8));
+    _mm_storeu_si128((__m128i *) (dst - stride + extWidth - 8), top);
 
-      x = _mm_loadu_si128((const __m128i*)(&(dst[j])));
-      _mm_storeu_si128((__m128i*)(dst + j - i*stride), x);
-      x = _mm_loadu_si128((const __m128i*)(dst + j + (height - 1)*stride));
-      _mm_storeu_si128((__m128i*)(dst + j + (height - 1 + i)*stride), x);
+    dst += height * stride;
 
-      j = j + 8;
-      temp = temp - 8;
-    }
-    while ((temp >> 2) > 0)
+    for (size_t i = 0; i < extWidth - 8; i++)
     {
-      x = _mm_loadl_epi64((const __m128i*)(&dst[j]));
-      _mm_storel_epi64((__m128i*)(dst + j - i*stride), x);
-      x = _mm_loadl_epi64((const __m128i*)(dst + j + (height - 1)*stride));
-      _mm_storel_epi64((__m128i*)(dst + j + (height - 1 + i)*stride), x);
-
-      j = j + 4;
-      temp = temp - 4;
+      __m128i bottom = _mm_loadu_si128((const __m128i *) (dst - stride + i));
+      _mm_storeu_si128((__m128i *) (dst + i), bottom);
     }
-    while (temp > 0)
+    __m128i bottom = _mm_loadu_si128((const __m128i *) (dst - stride + extWidth - 8));
+    _mm_storeu_si128((__m128i *) (dst + extWidth - 8), bottom);
+  }
+  else if (padSize == 2)
+  {
+    for (size_t i = 0; i < height; i++)
     {
-      dst[j - i*stride] = dst[j];
-      dst[j + (height - 1 + i)*stride] = dst[j + (height - 1)*stride];
-      j++;
-      temp--;
+      Pel left = dst[i * stride];
+      Pel right = dst[i * stride + width - 1];
+      dst[i * stride - 2] = left;
+      dst[i * stride - 1] = left;
+      dst[i * stride + width] = right;
+      dst[i * stride + width + 1] = right;
     }
-  }
+    dst -= 2;
 
-  //Left and Right Padding
-  Pel* ptr1 = dst - padSize*stride;
-  Pel* ptr2 = dst - padSize*stride + width - 1;
-  int offset = 0;
-  for (int i = 0; i < height + 2 * padSize; i++)
-  {
-    offset = stride * i;
-    for (int j = 1; j <= padSize; j++)
+    for (size_t i = 0; i < extWidth - 8; i++)
     {
-      *(ptr1 - j + offset) = *(ptr1 + offset);
-      *(ptr2 + j + offset) = *(ptr2 + offset);
+      __m128i top = _mm_loadu_si128((const __m128i *) (dst + i));
+      _mm_storeu_si128((__m128i *) (dst - 2 * stride + i), top);
+      _mm_storeu_si128((__m128i *) (dst - stride + i), top);
     }
+    __m128i top = _mm_loadu_si128((const __m128i *) (dst + extWidth - 8));
+    _mm_storeu_si128((__m128i *) (dst - 2 * stride + extWidth - 8), top);
+    _mm_storeu_si128((__m128i *) (dst - stride + extWidth - 8), top);
+
+    dst += height * stride;
+    for (size_t i = 0; i < extWidth - 8; i++)
+    {
+      __m128i bottom = _mm_loadu_si128((const __m128i *) (dst - stride + i));
+      _mm_storeu_si128((__m128i *) (dst + i), bottom);
+      _mm_storeu_si128((__m128i *) (dst + stride + i), bottom);
+    }
+    __m128i bottom = _mm_loadu_si128((const __m128i *) (dst - stride + extWidth - 8));
+    _mm_storeu_si128((__m128i *) (dst + extWidth - 8), bottom);
+    _mm_storeu_si128((__m128i *) (dst + stride + extWidth - 8), bottom);
+  }
+  else
+  {
+    CHECK(false, "padding size must be 1 or 2");
   }
 }
+
 template< X86_VEXT vext >
 void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1Stride, Pel *dst, int dstStride, const Pel *gradX0, const Pel *gradX1, const Pel *gradY0, const Pel*gradY1, int gradStride, int width, int height, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
 {
-  __m128i mm_tmpx = _mm_unpacklo_epi64(_mm_set1_epi16(tmpx), _mm_set1_epi16(tmpy));
-  __m128i mm_boffset = _mm_set1_epi32(1);
-  __m128i mm_offset = _mm_set1_epi32(offset);
+  __m128i c = _mm_unpacklo_epi16(_mm_set1_epi16(tmpx), _mm_set1_epi16(tmpy));
   __m128i vibdimin = _mm_set1_epi16(clpRng.min);
   __m128i vibdimax = _mm_set1_epi16(clpRng.max);
 
@@ -258,20 +239,22 @@ void addBIOAvg4_SSE(const Pel* src0, int src0Stride, const Pel* src1, int src1St
   {
     for (int x = 0; x < width; x += 4)
     {
-      __m128i mm_a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX0 + x)), _mm_loadl_epi64((const __m128i *)(gradY0 + x)));
-      __m128i mm_b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)(gradX1 + x)), _mm_loadl_epi64((const __m128i *)(gradY1 + x)));
-      mm_a = _mm_sub_epi16(mm_a, mm_b);
-      mm_b = _mm_mulhi_epi16(mm_a, mm_tmpx);
-      mm_a = _mm_mullo_epi16(mm_a, mm_tmpx);
-
-      __m128i mm_sum = _mm_add_epi32(_mm_unpacklo_epi16(mm_a, mm_b), _mm_unpackhi_epi16(mm_a, mm_b));
-      mm_sum = _mm_srai_epi32(_mm_add_epi32(mm_sum, mm_boffset), 1);
-      mm_a = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src0 + x)));
-      mm_b = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src1 + x)));
-      mm_sum = _mm_add_epi32(_mm_add_epi32(mm_sum, mm_a), _mm_add_epi32(mm_b, mm_offset));
-      mm_sum = _mm_packs_epi32(_mm_srai_epi32(mm_sum, shift), mm_a);
-      mm_sum = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_sum));
-      _mm_storel_epi64((__m128i *)(dst + x), mm_sum);
+      __m128i a = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) (gradX0 + x)),
+                                     _mm_loadl_epi64((const __m128i *) (gradY0 + x)));
+      __m128i b = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) (gradX1 + x)),
+                                     _mm_loadl_epi64((const __m128i *) (gradY1 + x)));
+      a = _mm_sub_epi16(a, b);
+      __m128i sum = _mm_madd_epi16(a, c);
+
+      a = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i *) (src0 + x)),
+                             _mm_loadl_epi64((const __m128i *) (src1 + x)));
+      sum = _mm_add_epi32(sum, _mm_madd_epi16(a, _mm_set1_epi16(2)));
+      sum = _mm_add_epi32(sum, _mm_set1_epi32(2 * offset + 1));
+      sum = _mm_sra_epi32(sum, _mm_cvtsi32_si128(shift + 1));
+      sum = _mm_packs_epi32(sum, sum);
+      sum = _mm_max_epi16(sum, vibdimin);
+      sum = _mm_min_epi16(sum, vibdimax);
+      _mm_storel_epi64((__m128i *) (dst + x), sum);
     }
     dst += dstStride;  src0 += src0Stride;  src1 += src1Stride;
     gradX0 += gradStride; gradX1 += gradStride; gradY0 += gradStride; gradY1 += gradStride;
@@ -291,7 +274,7 @@ void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel*
   __m128i sumAbsGYTmp = _mm_setzero_si128();
   __m128i sumDIYTmp = _mm_setzero_si128();
   __m128i sumSignGyGxTmp = _mm_setzero_si128();
-  Pel tmpStore[8];
+
   for (int y = 0; y < 6; y++)
   {
     __m128i shiftSrcY0Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY0Tmp)), shift4);
@@ -321,16 +304,32 @@ void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel*
     gradY0 += widthG;
     gradY1 += widthG;
   }
-  _mm_storeu_si128((__m128i *)tmpStore, sumAbsGXTmp);
-  *sumAbsGX = tmpStore[0] + tmpStore[1] + tmpStore[2] + tmpStore[3] + tmpStore[4] + tmpStore[5];
-  _mm_storeu_si128((__m128i *)tmpStore, sumAbsGYTmp);
-  *sumAbsGY = tmpStore[0] + tmpStore[1] + tmpStore[2] + tmpStore[3] + tmpStore[4] + tmpStore[5];
-  _mm_storeu_si128((__m128i *)tmpStore, sumDIXTmp);
-  *sumDIX = tmpStore[0] + tmpStore[1] + tmpStore[2] + tmpStore[3] + tmpStore[4] + tmpStore[5];
-  _mm_storeu_si128((__m128i *)tmpStore, sumDIYTmp);
-  *sumDIY = tmpStore[0] + tmpStore[1] + tmpStore[2] + tmpStore[3] + tmpStore[4] + tmpStore[5];
-  _mm_storeu_si128((__m128i *)tmpStore, sumSignGyGxTmp);
-  *sumSignGY_GX = tmpStore[0] + tmpStore[1] + tmpStore[2] + tmpStore[3] + tmpStore[4] + tmpStore[5];
+
+  sumAbsGXTmp = _mm_madd_epi16(sumAbsGXTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
+  sumDIXTmp = _mm_madd_epi16(sumDIXTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
+  sumAbsGYTmp = _mm_madd_epi16(sumAbsGYTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
+  sumDIYTmp = _mm_madd_epi16(sumDIYTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
+  sumSignGyGxTmp = _mm_madd_epi16(sumSignGyGxTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
+
+  __m128i a12 = _mm_unpacklo_epi32(sumAbsGXTmp, sumAbsGYTmp);
+  __m128i a3 = _mm_unpackhi_epi32(sumAbsGXTmp, sumAbsGYTmp);
+  __m128i b12 = _mm_unpacklo_epi32(sumDIXTmp, sumDIYTmp);
+  __m128i b3 = _mm_unpackhi_epi32(sumDIXTmp, sumDIYTmp);
+  __m128i c1 = _mm_unpacklo_epi64(a12, b12);
+  __m128i c2 = _mm_unpackhi_epi64(a12, b12);
+  __m128i c3 = _mm_unpacklo_epi64(a3, b3);
+
+  c1 = _mm_add_epi32(c1, c2);
+  c1 = _mm_add_epi32(c1, c3);
+
+  *sumAbsGX = _mm_cvtsi128_si32(c1);
+  *sumAbsGY = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0x55));
+  *sumDIX = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xaa));
+  *sumDIY = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xff));
+
+  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0x4e));   // 01001110
+  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0xb1));   // 10110001
+  *sumSignGY_GX = _mm_cvtsi128_si32(sumSignGyGxTmp);
 }
 
 #endif
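
Note (not part of the patch): the new addAvg_SSE W==8 path relies on a small rounding identity instead of widening to 32 bits. XOR-ing a 16-bit sample with 0x7fff maps x to 32767 - x (modulo 2^16), so negating both inputs and the output of _mm_avg_epu16 turns its round-up unsigned average into the floored signed average (src0 + src1) >> 1; adding offset >> 1 and shifting right by shift - 1 then matches (src0 + src1 + offset) >> shift bit-exactly, which is why the patch adds the two CHECKs on offset (even and within 16-bit range). The sketch below is a scalar model of that lane arithmetic with arbitrary sample values; it is not code from the repository.

#include <cassert>
#include <cstdint>

// Scalar model of the new addAvg_SSE W==8 arithmetic (one 16-bit lane).
static int16_t addAvgLane(int16_t a, int16_t b, int offset, int shift)
{
  uint16_t na  = static_cast<uint16_t>(a) ^ 0x7fff;          // x -> 32767 - x
  uint16_t nb  = static_cast<uint16_t>(b) ^ 0x7fff;
  uint16_t avg = static_cast<uint16_t>((na + nb + 1) >> 1);  // _mm_avg_epu16 rounds up
  int16_t  v   = static_cast<int16_t>(avg ^ 0x7fff);         // back to signed: (a + b) >> 1, rounded down
  // The SIMD code uses a saturating add here; the CHECKs on 'offset' keep the result exact.
  return static_cast<int16_t>((v + (offset >> 1)) >> (shift - 1));
}

int main()
{
  const int shift = 5, offset = 16400;                       // arbitrary even 16-bit offset
  for (int a = -2048; a <= 2047; a += 3)
  {
    for (int b = -2048; b <= 2047; b += 7)
    {
      assert(addAvgLane(int16_t(a), int16_t(b), offset, shift) == ((a + b + offset) >> shift));
    }
  }
  return 0;
}

The same folding idea appears in the reworked addBIOAvg4_SSE: because ((g + 1) >> 1) + s equals (g + 2 * s + 1) >> 1 for integers, the separately rounded gradient correction, the two source samples, and the offset collapse into one _mm_madd_epi16-based 32-bit sum that is shifted by shift + 1 in a single step.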